[llvm] r316938 - [X86][AVX512] Cleanup scheduler tests - split GENERIC and SKX targets

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 30 11:37:27 PDT 2017


Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=316938&r1=316937&r2=316938&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Mon Oct 30 11:37:27 2017
@@ -1,25 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
+
 ; This test is an assembly of avx512 shuffling instructions to check their scheduling
 
 define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -27,27 +44,44 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -55,27 +89,44 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -83,36 +134,59 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -120,36 +194,58 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
-; CHECK-LABEL: test_16xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -158,13 +254,21 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -173,13 +277,21 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -188,13 +300,21 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -203,13 +323,21 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -218,13 +346,21 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -233,23 +369,37 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
-; CHECK-LABEL: test_16xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -258,13 +408,21 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -273,23 +431,38 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
-; CHECK-LABEL: test_32xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -297,27 +470,44 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -325,27 +515,44 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -353,36 +560,59 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
-; CHECK-LABEL: test_32xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -390,36 +620,58 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
-; CHECK-LABEL: test_32xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -428,13 +680,21 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -443,13 +703,21 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -458,13 +726,21 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -473,13 +749,21 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -488,13 +772,21 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -503,23 +795,37 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
-; CHECK-LABEL: test_32xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -528,13 +834,21 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -543,23 +857,38 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
-; CHECK-LABEL: test_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -567,27 +896,44 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -595,27 +941,44 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -623,36 +986,59 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
-; CHECK-LABEL: test_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -660,36 +1046,58 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
-; CHECK-LABEL: test_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -698,13 +1106,21 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -713,13 +1129,21 @@ define <8 x i32> @test_masked_z_8xi32_pe
 }
 
 define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -728,13 +1152,21 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -743,13 +1175,21 @@ define <8 x i32> @test_masked_z_8xi32_pe
 }
 
 define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -758,13 +1198,21 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -773,23 +1221,37 @@ define <8 x i32> @test_masked_z_8xi32_pe
 }
 
 define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
-; CHECK-LABEL: test_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -798,13 +1260,21 @@ define <8 x i32> @test_masked_8xi32_perm
 }
 
 define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -813,23 +1283,38 @@ define <8 x i32> @test_masked_z_8xi32_pe
 }
 
 define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -837,27 +1322,44 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -865,27 +1367,44 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -893,36 +1412,59 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
-; CHECK-LABEL: test_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -930,36 +1472,58 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
-; CHECK-LABEL: test_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -968,13 +1532,21 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -983,13 +1555,21 @@ define <16 x i32> @test_masked_z_16xi32_
 }
 
 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -998,13 +1578,21 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1013,13 +1601,21 @@ define <16 x i32> @test_masked_z_16xi32_
 }
 
 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1028,13 +1624,21 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1043,23 +1647,37 @@ define <16 x i32> @test_masked_z_16xi32_
 }
 
 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
-; CHECK-LABEL: test_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1068,13 +1686,21 @@ define <16 x i32> @test_masked_16xi32_pe
 }
 
 define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -1083,21 +1709,34 @@ define <16 x i32> @test_masked_z_16xi32_
 }
 
 define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
-; CHECK-LABEL: test_4xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1105,25 +1744,40 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1131,25 +1785,40 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1157,33 +1826,53 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
-; CHECK-LABEL: test_4xi64_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -1191,33 +1880,52 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
-; CHECK-LABEL: test_4xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1226,12 +1934,19 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1240,12 +1955,19 @@ define <4 x i64> @test_masked_z_4xi64_pe
 }
 
 define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1254,12 +1976,19 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1268,12 +1997,19 @@ define <4 x i64> @test_masked_z_4xi64_pe
 }
 
 define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1282,12 +2018,19 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1296,21 +2039,33 @@ define <4 x i64> @test_masked_z_4xi64_pe
 }
 
 define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
-; CHECK-LABEL: test_4xi64_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1319,12 +2074,19 @@ define <4 x i64> @test_masked_4xi64_perm
 }
 
 define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i64>, <4 x i64>* %vp
   %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -1333,23 +2095,38 @@ define <4 x i64> @test_masked_z_4xi64_pe
 }
 
 define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
-; CHECK-LABEL: test_8xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1357,26 +2134,42 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1384,26 +2177,42 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1411,34 +2220,55 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
-; CHECK-LABEL: test_8xi64_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1446,26 +2276,42 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1473,26 +2319,42 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1500,35 +2362,57 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
-; CHECK-LABEL: test_8xi64_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1536,26 +2420,42 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
@@ -1563,35 +2463,56 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
-; CHECK-LABEL: test_8xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1600,13 +2521,21 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1615,12 +2544,19 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1629,12 +2565,19 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1643,13 +2586,21 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1658,13 +2609,21 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1673,21 +2632,33 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
-; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1696,12 +2667,19 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1710,13 +2688,21 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1725,13 +2711,21 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1740,12 +2734,19 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1754,12 +2755,19 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1768,23 +2776,37 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
-; CHECK-LABEL: test_8xi64_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1793,13 +2815,21 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1808,12 +2838,19 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1822,12 +2859,19 @@ define <8 x i64> @test_masked_8xi64_perm
 }
 
 define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i64>, <8 x i64>* %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -1836,23 +2880,38 @@ define <8 x i64> @test_masked_z_8xi64_pe
 }
 
 define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
-; CHECK-LABEL: test_8xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
@@ -1860,27 +2919,44 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
@@ -1888,27 +2964,44 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
@@ -1916,36 +3009,59 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
-; CHECK-LABEL: test_8xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
@@ -1953,36 +3069,58 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
-; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -1991,13 +3129,21 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2006,13 +3152,21 @@ define <8 x float> @test_masked_z_8xfloa
 }
 
 define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2021,13 +3175,21 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2036,13 +3198,21 @@ define <8 x float> @test_masked_z_8xfloa
 }
 
 define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2051,13 +3221,21 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2066,23 +3244,37 @@ define <8 x float> @test_masked_z_8xfloa
 }
 
 define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   ret <8 x float> %res
 }
 define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2091,13 +3283,21 @@ define <8 x float> @test_masked_8xfloat_
 }
 
 define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x float>, <8 x float>* %vp
   %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -2106,23 +3306,38 @@ define <8 x float> @test_masked_z_8xfloa
 }
 
 define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
-; CHECK-LABEL: test_16xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
@@ -2130,27 +3345,44 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
@@ -2158,27 +3390,44 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
@@ -2186,36 +3435,59 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
-; CHECK-LABEL: test_16xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
@@ -2223,36 +3495,58 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
-; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2261,13 +3555,21 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2276,13 +3578,21 @@ define <16 x float> @test_masked_z_16xfl
 }
 
 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2291,13 +3601,21 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2306,13 +3624,21 @@ define <16 x float> @test_masked_z_16xfl
 }
 
 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2321,13 +3647,21 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2336,23 +3670,37 @@ define <16 x float> @test_masked_z_16xfl
 }
 
 define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
-; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   ret <16 x float> %res
 }
 define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2361,13 +3709,21 @@ define <16 x float> @test_masked_16xfloa
 }
 
 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x float>, <16 x float>* %vp
   %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -2376,21 +3732,34 @@ define <16 x float> @test_masked_z_16xfl
 }
 
 define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
-; CHECK-LABEL: test_4xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -2398,25 +3767,40 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -2424,25 +3808,40 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -2450,33 +3849,53 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
-; CHECK-LABEL: test_4xdouble_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
+; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -2484,33 +3903,52 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
-; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2519,12 +3957,19 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2533,12 +3978,19 @@ define <4 x double> @test_masked_z_4xdou
 }
 
 define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2547,12 +3999,19 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2561,12 +4020,19 @@ define <4 x double> @test_masked_z_4xdou
 }
 
 define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2575,12 +4041,19 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2589,21 +4062,33 @@ define <4 x double> @test_masked_z_4xdou
 }
 
 define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
-; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   ret <4 x double> %res
 }
 define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2612,12 +4097,19 @@ define <4 x double> @test_masked_4xdoubl
 }
 
 define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x double>, <4 x double>* %vp
   %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2626,23 +4118,38 @@ define <4 x double> @test_masked_z_4xdou
 }
 
 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
-; CHECK-LABEL: test_8xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2650,26 +4157,42 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2677,26 +4200,42 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2704,34 +4243,55 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
-; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2739,26 +4299,42 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2766,26 +4342,42 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2793,35 +4385,57 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
-; CHECK-LABEL: test_8xdouble_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2829,26 +4443,42 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
@@ -2856,35 +4486,56 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
-; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2893,13 +4544,21 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2908,12 +4567,19 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2922,12 +4588,19 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2936,13 +4609,21 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2951,13 +4632,21 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2966,21 +4655,33 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
-; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -2989,12 +4690,19 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3003,13 +4711,21 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3018,13 +4734,21 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3033,12 +4757,19 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3047,12 +4778,19 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3061,23 +4799,37 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
-; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   ret <8 x double> %res
 }
 define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3086,13 +4838,21 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3101,12 +4861,19 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3115,12 +4882,19 @@ define <8 x double> @test_masked_8xdoubl
 }
 
 define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x double>, <8 x double>* %vp
   %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -3129,21 +4903,34 @@ define <8 x double> @test_masked_z_8xdou
 }
 
 define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
-; CHECK-LABEL: test_16xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3151,25 +4938,40 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3177,25 +4979,40 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3203,33 +5020,53 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
 }
 define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
-; CHECK-LABEL: test_16xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
@@ -3237,35 +5074,56 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
   ret <16 x i8> %res
 }
 define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
-; CHECK-LABEL: test_16xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3274,13 +5132,21 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3289,13 +5155,21 @@ define <16 x i8> @test_masked_z_16xi8_pe
 }
 
 define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3304,13 +5178,21 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3319,13 +5201,21 @@ define <16 x i8> @test_masked_z_16xi8_pe
 }
 
 define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3334,13 +5224,21 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3349,23 +5247,37 @@ define <16 x i8> @test_masked_z_16xi8_pe
 }
 
 define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
-; CHECK-LABEL: test_16xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   ret <16 x i8> %res
 }
 define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3374,13 +5286,21 @@ define <16 x i8> @test_masked_16xi8_perm
 }
 
 define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i8>, <16 x i8>* %vp
   %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
   %cmp = icmp eq <16 x i8> %mask, zeroinitializer
@@ -3389,21 +5309,34 @@ define <16 x i8> @test_masked_z_16xi8_pe
 }
 
 define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
-; CHECK-LABEL: test_32xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3411,25 +5344,40 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3437,25 +5385,40 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3463,33 +5426,53 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
 }
 define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
-; CHECK-LABEL: test_32xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
@@ -3497,35 +5480,56 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
   ret <32 x i8> %res
 }
 define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
-; CHECK-LABEL: test_32xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3534,13 +5538,21 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3549,13 +5561,21 @@ define <32 x i8> @test_masked_z_32xi8_pe
 }
 
 define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3564,13 +5584,21 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3579,13 +5607,21 @@ define <32 x i8> @test_masked_z_32xi8_pe
 }
 
 define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3594,13 +5630,21 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3609,23 +5653,37 @@ define <32 x i8> @test_masked_z_32xi8_pe
 }
 
 define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
-; CHECK-LABEL: test_32xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   ret <32 x i8> %res
 }
 define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3634,13 +5692,21 @@ define <32 x i8> @test_masked_32xi8_perm
 }
 
 define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i8>, <32 x i8>* %vp
   %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
   %cmp = icmp eq <32 x i8> %mask, zeroinitializer
@@ -3649,21 +5715,34 @@ define <32 x i8> @test_masked_z_32xi8_pe
 }
 
 define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
-; CHECK-LABEL: test_64xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_64xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3671,25 +5750,40 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3697,25 +5791,40 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3723,33 +5832,53 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
 }
 define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
-; CHECK-LABEL: test_64xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_64xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
@@ -3757,35 +5886,56 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
   %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
   ret <64 x i8> %res
 }
 define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
-; CHECK-LABEL: test_64xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3794,13 +5944,21 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3809,13 +5967,21 @@ define <64 x i8> @test_masked_z_64xi8_pe
 }
 
 define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3824,13 +5990,21 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3839,13 +6013,21 @@ define <64 x i8> @test_masked_z_64xi8_pe
 }
 
 define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3854,13 +6036,21 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3869,23 +6059,37 @@ define <64 x i8> @test_masked_z_64xi8_pe
 }
 
 define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
-; CHECK-LABEL: test_64xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50]
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_64xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   ret <64 x i8> %res
 }
 define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3894,13 +6098,21 @@ define <64 x i8> @test_masked_64xi8_perm
 }
 
 define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
-; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <64 x i8>, <64 x i8>* %vp
   %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
   %cmp = icmp eq <64 x i8> %mask, zeroinitializer
@@ -3909,21 +6121,34 @@ define <64 x i8> @test_masked_z_64xi8_pe
 }
 
 define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
-; CHECK-LABEL: test_8xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -3931,25 +6156,40 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -3957,25 +6197,40 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -3983,33 +6238,53 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
-; CHECK-LABEL: test_8xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4017,25 +6292,40 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4043,25 +6333,40 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4069,33 +6374,53 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
-; CHECK-LABEL: test_8xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4103,25 +6428,40 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
@@ -4129,33 +6469,52 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
   ret <8 x i16> %res
 }
 define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
-; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4164,12 +6523,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4178,12 +6544,19 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4192,12 +6565,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4206,12 +6586,19 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4220,12 +6607,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4234,21 +6628,33 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
-; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4257,12 +6663,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4271,12 +6684,19 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4285,12 +6705,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4299,12 +6726,19 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4313,12 +6747,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4327,21 +6768,33 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
-; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   ret <8 x i16> %res
 }
 define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4350,12 +6803,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4364,12 +6824,19 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4378,12 +6845,19 @@ define <8 x i16> @test_masked_8xi16_perm
 }
 
 define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i16>, <8 x i16>* %vp
   %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
   %cmp = icmp eq <8 x i16> %mask, zeroinitializer
@@ -4392,21 +6866,34 @@ define <8 x i16> @test_masked_z_8xi16_pe
 }
 
 define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4414,25 +6901,40 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4440,25 +6942,40 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4466,33 +6983,53 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4500,25 +7037,40 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4526,25 +7078,40 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4552,33 +7119,53 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
-; CHECK-LABEL: test_16xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4586,25 +7173,40 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
@@ -4612,33 +7214,52 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
   ret <16 x i16> %res
 }
 define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
-; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4647,12 +7268,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4661,12 +7289,19 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4675,12 +7310,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4689,12 +7331,19 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4703,12 +7352,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4717,21 +7373,33 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
-; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4740,12 +7408,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4754,12 +7429,19 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4768,12 +7450,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4782,12 +7471,19 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4796,12 +7492,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4810,21 +7513,33 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
-; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   ret <16 x i16> %res
 }
 define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4833,12 +7548,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4847,12 +7569,19 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4861,12 +7590,19 @@ define <16 x i16> @test_masked_16xi16_pe
 }
 
 define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i16>, <16 x i16>* %vp
   %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <16 x i16> %mask, zeroinitializer
@@ -4875,21 +7611,34 @@ define <16 x i16> @test_masked_z_16xi16_
 }
 
 define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
-; CHECK-LABEL: test_32xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -4897,25 +7646,40 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -4923,25 +7687,40 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -4949,33 +7728,53 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
-; CHECK-LABEL: test_32xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -4983,25 +7782,40 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5009,25 +7823,40 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5035,33 +7864,53 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
-; CHECK-LABEL: test_32xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5069,25 +7918,40 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
@@ -5095,33 +7959,52 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
   %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
   ret <32 x i16> %res
 }
 define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
-; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5130,12 +8013,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5144,12 +8034,19 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5158,12 +8055,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5172,12 +8076,19 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5186,12 +8097,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5200,21 +8118,33 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
-; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5223,12 +8153,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5237,12 +8174,19 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5251,12 +8195,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5265,13 +8216,21 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
+; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5280,13 +8239,21 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1
+; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5295,21 +8262,33 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
-; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   ret <32 x i16> %res
 }
 define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5318,12 +8297,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5332,12 +8318,19 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5346,12 +8339,19 @@ define <32 x i16> @test_masked_32xi16_pe
 }
 
 define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
-; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <32 x i16>, <32 x i16>* %vp
   %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <32 x i16> %mask, zeroinitializer
@@ -5360,21 +8360,34 @@ define <32 x i16> @test_masked_z_32xi16_
 }
 
 define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
-; CHECK-LABEL: test_4xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5382,25 +8395,40 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5408,25 +8436,40 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5434,33 +8477,53 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
-; CHECK-LABEL: test_4xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
+; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
@@ -5468,33 +8531,52 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
-; CHECK-LABEL: test_4xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5503,12 +8585,19 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5517,12 +8606,19 @@ define <4 x i32> @test_masked_z_4xi32_pe
 }
 
 define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5531,12 +8627,19 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5545,12 +8648,19 @@ define <4 x i32> @test_masked_z_4xi32_pe
 }
 
 define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5559,12 +8669,19 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5573,21 +8690,33 @@ define <4 x i32> @test_masked_z_4xi32_pe
 }
 
 define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
-; CHECK-LABEL: test_4xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   ret <4 x i32> %res
 }
 define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5596,12 +8725,19 @@ define <4 x i32> @test_masked_4xi32_perm
 }
 
 define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
-; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <4 x i32>, <4 x i32>* %vp
   %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -5610,21 +8746,34 @@ define <4 x i32> @test_masked_z_4xi32_pe
 }
 
 define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
-; CHECK-LABEL: test2_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5632,25 +8781,40 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5658,25 +8822,40 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5684,33 +8863,53 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
-; CHECK-LABEL: test2_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
+; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
@@ -5718,33 +8917,52 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
-; CHECK-LABEL: test2_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5753,12 +8971,19 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5767,12 +8992,19 @@ define <8 x i32> @test2_masked_z_8xi32_p
 }
 
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5781,12 +9013,19 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5795,12 +9034,19 @@ define <8 x i32> @test2_masked_z_8xi32_p
 }
 
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5809,12 +9055,19 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5823,21 +9076,33 @@ define <8 x i32> @test2_masked_z_8xi32_p
 }
 
 define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
-; CHECK-LABEL: test2_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   ret <8 x i32> %res
 }
 define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5846,12 +9111,19 @@ define <8 x i32> @test2_masked_8xi32_per
 }
 
 define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <8 x i32>, <8 x i32>* %vp
   %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -5860,21 +9132,34 @@ define <8 x i32> @test2_masked_z_8xi32_p
 }
 
 define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
-; CHECK-LABEL: test2_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5882,25 +9167,40 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5908,25 +9208,40 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5934,33 +9249,53 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
-; CHECK-LABEL: test2_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
@@ -5968,33 +9303,52 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
-; CHECK-LABEL: test2_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6003,12 +9357,19 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6017,12 +9378,19 @@ define <16 x i32> @test2_masked_z_16xi32
 }
 
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6031,12 +9399,19 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6045,12 +9420,19 @@ define <16 x i32> @test2_masked_z_16xi32
 }
 
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6059,12 +9441,19 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6073,21 +9462,33 @@ define <16 x i32> @test2_masked_z_16xi32
 }
 
 define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
-; CHECK-LABEL: test2_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   ret <16 x i32> %res
 }
 define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6096,12 +9497,19 @@ define <16 x i32> @test2_masked_16xi32_p
 }
 
 define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
-; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
+; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec = load <16 x i32>, <16 x i32>* %vp
   %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6110,21 +9518,34 @@ define <16 x i32> @test2_masked_z_16xi32
 }
 
 define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
-; CHECK-LABEL: test2_8xfloat_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
 define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -6132,26 +9553,42 @@ define <8 x float> @test2_8xfloat_masked
 }
 
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -6159,26 +9596,42 @@ define <8 x float> @test2_8xfloat_masked
 }
 
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -6186,34 +9639,55 @@ define <8 x float> @test2_8xfloat_masked
 }
 
 define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
-; CHECK-LABEL: test2_8xfloat_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
 define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test2_8xfloat_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -6221,35 +9695,56 @@ define <8 x float> @test2_8xfloat_masked
 }
 
 define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
-; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6258,13 +9753,21 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6273,13 +9776,21 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6288,13 +9799,21 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6303,13 +9822,21 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6318,13 +9845,21 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6333,22 +9868,35 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
-; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6357,13 +9905,21 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -6372,21 +9928,34 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -6394,25 +9963,40 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -6420,25 +10004,40 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -6446,33 +10045,53 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
-; CHECK-LABEL: test_16xfloat_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -6480,34 +10099,54 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
-; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6516,12 +10155,19 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6530,13 +10176,21 @@ define <16 x float> @test_16xfloat_zero_
 }
 
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6545,12 +10199,19 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6559,13 +10220,21 @@ define <16 x float> @test_16xfloat_zero_
 }
 
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6574,12 +10243,19 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6588,22 +10264,35 @@ define <16 x float> @test_16xfloat_zero_
 }
 
 define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
-; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; GENERIC-NEXT:    vmovaps %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6612,12 +10301,19 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x float>, <16 x float>* %vec2p
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -6626,21 +10322,34 @@ define <16 x float> @test_16xfloat_zero_
 }
 
 define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
-; CHECK-LABEL: test_4xdouble_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
@@ -6648,26 +10357,42 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
@@ -6675,26 +10400,42 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
@@ -6702,34 +10443,55 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
-; CHECK-LABEL: test_4xdouble_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
@@ -6737,35 +10499,56 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
-; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6774,13 +10557,21 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6789,13 +10580,21 @@ define <4 x double> @test_4xdouble_zero_
 }
 
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6804,13 +10603,21 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6819,13 +10626,21 @@ define <4 x double> @test_4xdouble_zero_
 }
 
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6834,13 +10649,21 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6849,22 +10672,35 @@ define <4 x double> @test_4xdouble_zero_
 }
 
 define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
-; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x double> %res
 }
 define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6873,13 +10709,21 @@ define <4 x double> @test_4xdouble_maske
 }
 
 define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x double>, <4 x double>* %vec2p
   %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -6888,21 +10732,34 @@ define <4 x double> @test_4xdouble_zero_
 }
 
 define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
-; CHECK-LABEL: test_8xdouble_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
@@ -6910,25 +10767,40 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
@@ -6936,25 +10808,40 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
@@ -6962,33 +10849,53 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
-; CHECK-LABEL: test_8xdouble_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    vmovapd %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; GENERIC-NEXT:    vmovapd %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovapd %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
@@ -6996,34 +10903,54 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
-; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7032,12 +10959,19 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7046,13 +10980,21 @@ define <8 x double> @test_8xdouble_zero_
 }
 
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7061,12 +11003,19 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7075,13 +11024,21 @@ define <8 x double> @test_8xdouble_zero_
 }
 
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7090,12 +11047,19 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7104,22 +11068,35 @@ define <8 x double> @test_8xdouble_zero_
 }
 
 define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
-; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   ret <8 x double> %res
 }
 define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; CHECK-NEXT:    vmovapd %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
+; GENERIC-NEXT:    vmovapd %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovapd %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7128,12 +11105,19 @@ define <8 x double> @test_8xdouble_maske
 }
 
 define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x double>, <8 x double>* %vec2p
   %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -7142,21 +11126,34 @@ define <8 x double> @test_8xdouble_zero_
 }
 
 define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
-; CHECK-LABEL: test_8xi32_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7164,26 +11161,42 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7191,26 +11204,42 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7218,34 +11247,55 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
-; CHECK-LABEL: test_8xi32_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
@@ -7253,35 +11303,56 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
-; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7290,13 +11361,21 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7305,13 +11384,21 @@ define <8 x i32> @test_8xi32_zero_masked
 }
 
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7320,13 +11407,21 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7335,13 +11430,21 @@ define <8 x i32> @test_8xi32_zero_masked
 }
 
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7350,13 +11453,21 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7365,22 +11476,35 @@ define <8 x i32> @test_8xi32_zero_masked
 }
 
 define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
-; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   ret <8 x i32> %res
 }
 define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7389,13 +11513,21 @@ define <8 x i32> @test_8xi32_masked_shuf
 }
 
 define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i32>, <8 x i32>* %vec2p
   %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -7404,21 +11536,34 @@ define <8 x i32> @test_8xi32_zero_masked
 }
 
 define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
-; CHECK-LABEL: test_16xi32_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7426,25 +11571,40 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7452,25 +11612,40 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7478,33 +11653,53 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
-; CHECK-LABEL: test_16xi32_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
@@ -7512,34 +11707,54 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
-; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7548,12 +11763,19 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7562,13 +11784,21 @@ define <16 x i32> @test_16xi32_zero_mask
 }
 
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7577,12 +11807,19 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7591,13 +11828,21 @@ define <16 x i32> @test_16xi32_zero_mask
 }
 
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7606,12 +11851,19 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7620,22 +11872,35 @@ define <16 x i32> @test_16xi32_zero_mask
 }
 
 define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
-; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   ret <16 x i32> %res
 }
 define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7644,12 +11909,19 @@ define <16 x i32> @test_16xi32_masked_sh
 }
 
 define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <16 x i32>, <16 x i32>* %vec2p
   %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
@@ -7658,21 +11930,34 @@ define <16 x i32> @test_16xi32_zero_mask
 }
 
 define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
-; CHECK-LABEL: test_4xi64_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7680,26 +11965,42 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7707,26 +12008,42 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7734,34 +12051,55 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
-; CHECK-LABEL: test_4xi64_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
@@ -7769,35 +12107,56 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
+; GENERIC-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
-; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7806,13 +12165,21 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7821,13 +12188,21 @@ define <4 x i64> @test_4xi64_zero_masked
 }
 
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7836,13 +12211,21 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7851,13 +12234,21 @@ define <4 x i64> @test_4xi64_zero_masked
 }
 
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7866,13 +12257,21 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7881,22 +12280,35 @@ define <4 x i64> @test_4xi64_zero_masked
 }
 
 define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
-; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   ret <4 x i64> %res
 }
 define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7905,13 +12317,21 @@ define <4 x i64> @test_4xi64_masked_shuf
 }
 
 define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00]
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x i64>, <4 x i64>* %vec2p
   %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   %cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -7920,21 +12340,34 @@ define <4 x i64> @test_4xi64_zero_masked
 }
 
 define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
-; CHECK-LABEL: test_8xi64_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7942,25 +12375,40 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7968,25 +12416,40 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
@@ -7994,33 +12457,53 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
-; CHECK-LABEL: test_8xi64_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
@@ -8028,34 +12511,54 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
-; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8064,12 +12567,19 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8078,13 +12588,21 @@ define <8 x i64> @test_8xi64_zero_masked
 }
 
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8093,12 +12611,19 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8107,13 +12632,21 @@ define <8 x i64> @test_8xi64_zero_masked
 }
 
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8122,12 +12655,19 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8136,22 +12676,35 @@ define <8 x i64> @test_8xi64_zero_masked
 }
 
 define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
-; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   ret <8 x i64> %res
 }
 define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
+; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8160,12 +12713,19 @@ define <8 x i64> @test_8xi64_masked_shuf
 }
 
 define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1
+; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x i64>, <8 x i64>* %vec2p
   %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
   %cmp = icmp eq <8 x i64> %mask, zeroinitializer
@@ -8174,21 +12734,34 @@ define <8 x i64> @test_8xi64_zero_masked
 }
 
 define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
-; CHECK-LABEL: test_4xfloat_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
@@ -8196,25 +12769,40 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
@@ -8222,25 +12810,40 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
@@ -8248,33 +12851,53 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
-; CHECK-LABEL: test_4xfloat_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
@@ -8282,34 +12905,54 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
   %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
-; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8318,12 +12961,19 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8332,13 +12982,21 @@ define <4 x float> @test_4xfloat_zero_ma
 }
 
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8347,12 +13005,19 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8361,13 +13026,21 @@ define <4 x float> @test_4xfloat_zero_ma
 }
 
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8376,12 +13049,19 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8390,22 +13070,35 @@ define <4 x float> @test_4xfloat_zero_ma
 }
 
 define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
-; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   ret <4 x float> %res
 }
 define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8414,12 +13107,19 @@ define <4 x float> @test_4xfloat_masked_
 }
 
 define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <4 x float>, <4 x float>* %vec2p
   %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   %cmp = icmp eq <4 x i32> %mask, zeroinitializer
@@ -8428,21 +13128,34 @@ define <4 x float> @test_4xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
-; CHECK-LABEL: test_8xfloat_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -8450,25 +13163,40 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -8476,25 +13204,40 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -8502,33 +13245,53 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
-; CHECK-LABEL: test_8xfloat_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
@@ -8536,34 +13299,54 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
   %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
-; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8572,12 +13355,19 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8586,13 +13376,21 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8601,12 +13399,19 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8615,13 +13420,21 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8630,12 +13443,19 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8644,22 +13464,35 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
-; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   ret <8 x float> %res
 }
 define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8668,12 +13501,19 @@ define <8 x float> @test_8xfloat_masked_
 }
 
 define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %vec2 = load <8 x float>, <8 x float>* %vec2p
   %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
   %cmp = icmp eq <8 x i32> %mask, zeroinitializer
@@ -8682,21 +13522,34 @@ define <8 x float> @test_8xfloat_zero_ma
 }
 
 define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
-; CHECK-LABEL: test_16xfloat_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -8704,25 +13557,40 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -8730,25 +13598,40 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -8756,33 +13639,53 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
-; CHECK-LABEL: test_16xfloat_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   ret <16 x float> %res
 }
 define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
-; CHECK-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
-; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; CHECK-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; CHECK-NEXT:    retq # sched: [7:1.00]
+; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; GENERIC:       # BB#0:
+; GENERIC-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; GENERIC-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; GENERIC-NEXT:    vmovaps %zmm2, %zmm0
+; GENERIC-NEXT:    retq # sched: [1:1.00]
+;
+; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33]
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00]
+; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
+; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
+; SKX-NEXT:    retq # sched: [7:1.00]
   %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
   %cmp = icmp eq <16 x i32> %mask, zeroinitializer
   %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
@@ -8790,34 +13693,54 @@ define <16 x float> @test_16xfloat_maske
 }
 
 define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
-; CHECK:       # BB#0:

[... 4324 lines stripped ...]



More information about the llvm-commits mailing list