[llvm] r353043 - [AsmPrinter] Remove hidden flag -print-schedule.

Andrea Di Biagio via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 4 04:51:26 PST 2019


Removed: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (removed)
@@ -1,15629 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
-
-; This test is an assembly of avx512 shuffling instructions to check their scheduling
-
-define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) {
-; GENERIC-LABEL: test_16xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) {
-; GENERIC-LABEL: test_16xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT:    vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) {
-; GENERIC-LABEL: test_16xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) {
-; GENERIC-LABEL: test_16xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT:    vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) {
-; GENERIC-LABEL: test_32xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) {
-; GENERIC-LABEL: test_32xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT:    vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) {
-; GENERIC-LABEL: test_32xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) {
-; GENERIC-LABEL: test_32xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT:    vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50]
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) {
-; GENERIC-LABEL: test_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) {
-; GENERIC-LABEL: test_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
-; GENERIC-LABEL: test_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
-; GENERIC-LABEL: test_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) {
-; GENERIC-LABEL: test_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
-; GENERIC-LABEL: test_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
-; GENERIC-LABEL: test_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
-; GENERIC-LABEL: test_4xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
-; GENERIC-LABEL: test_4xi64_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
-; GENERIC-LABEL: test_4xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
-; GENERIC-LABEL: test_4xi64_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi64_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i64>, <4 x i64>* %vp
-  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
-; GENERIC-LABEL: test_8xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
-; GENERIC-LABEL: test_8xi64_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
-; GENERIC-LABEL: test_8xi64_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
-; GENERIC-LABEL: test_8xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
-; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
-; GENERIC-LABEL: test_8xi64_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i64>, <8 x i64>* %vp
-  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
-; GENERIC-LABEL: test_8xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
-; GENERIC-LABEL: test_8xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
-; GENERIC-LABEL: test_8xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
-  ret <8 x float> %res
-}
-define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50]
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x float>, <8 x float>* %vp
-  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
-; GENERIC-LABEL: test_16xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
-; GENERIC-LABEL: test_16xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
-; GENERIC-LABEL: test_16xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
-; GENERIC-LABEL: test_16xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT:    vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
-  ret <16 x float> %res
-}
-define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50]
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x float>, <16 x float>* %vp
-  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
-; GENERIC-LABEL: test_4xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
-; GENERIC-LABEL: test_4xdouble_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
-; GENERIC-LABEL: test_4xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
-; GENERIC-LABEL: test_4xdouble_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
-  ret <4 x double> %res
-}
-define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x double>, <4 x double>* %vp
-  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
-; GENERIC-LABEL: test_8xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
-; GENERIC-LABEL: test_8xdouble_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
-; GENERIC-LABEL: test_8xdouble_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
-; GENERIC-LABEL: test_8xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
-; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
-; GENERIC-LABEL: test_8xdouble_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
-  ret <8 x double> %res
-}
-define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50]
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) {
-; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x double>, <8 x double>* %vp
-  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
-; GENERIC-LABEL: test_16xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
-; GENERIC-LABEL: test_16xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
-; GENERIC-LABEL: test_16xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
-; GENERIC-LABEL: test_16xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [6:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  ret <16 x i8> %res
-}
-define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm2 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2
-  ret <16 x i8> %res
-}
-
-define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; GENERIC-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [6:0.50]
-; SKX-NEXT:    vptestnmb %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i8>, <16 x i8>* %vp
-  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
-  %cmp = icmp eq <16 x i8> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer
-  ret <16 x i8> %res
-}
-
-define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
-; GENERIC-LABEL: test_32xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) {
-; GENERIC-LABEL: test_32xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) {
-; GENERIC-LABEL: test_32xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) {
-; GENERIC-LABEL: test_32xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [7:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
-  ret <32 x i8> %res
-}
-define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm2 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2
-  ret <32 x i8> %res
-}
-
-define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [7:0.50]
-; SKX-NEXT:    vptestnmb %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i8>, <32 x i8>* %vp
-  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
-  %cmp = icmp eq <32 x i8> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer
-  ret <32 x i8> %res
-}
-
-define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) {
-; GENERIC-LABEL: test_64xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_64xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) {
-; GENERIC-LABEL: test_64xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_64xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) {
-; GENERIC-LABEL: test_64xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_64xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) {
-; GENERIC-LABEL: test_64xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_64xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
-  ret <64 x i8> %res
-}
-define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_64xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2
-  ret <64 x i8> %res
-}
-
-define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) {
-; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50]
-; GENERIC-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50]
-; SKX-NEXT:    vptestnmb %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <64 x i8>, <64 x i8>* %vp
-  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
-  %cmp = icmp eq <64 x i8> %mask, zeroinitializer
-  %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer
-  ret <64 x i8> %res
-}
-
-define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) {
-; GENERIC-LABEL: test_8xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) {
-; GENERIC-LABEL: test_8xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) {
-; GENERIC-LABEL: test_8xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) {
-; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) {
-; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) {
-; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
-  ret <8 x i16> %res
-}
-define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2
-  ret <8 x i16> %res
-}
-
-define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i16>, <8 x i16>* %vp
-  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7>
-  %cmp = icmp eq <8 x i16> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer
-  ret <8 x i16> %res
-}
-
-define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) {
-; GENERIC-LABEL: test_16xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) {
-; GENERIC-LABEL: test_16xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) {
-; GENERIC-LABEL: test_16xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) {
-; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) {
-; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
-; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
-  ret <16 x i16> %res
-}
-define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2
-  ret <16 x i16> %res
-}
-
-define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i16>, <16 x i16>* %vp
-  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <16 x i16> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer
-  ret <16 x i16> %res
-}
-
-define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
-; GENERIC-LABEL: test_32xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
-; GENERIC-LABEL: test_32xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
-; GENERIC-LABEL: test_32xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
-; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
-; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm2, %zmm0 {%k1} # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00]
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) {
-; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_32xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
-  ret <32 x i16> %res
-}
-define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2
-  ret <32 x i16> %res
-}
-
-define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) {
-; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmw %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <32 x i16>, <32 x i16>* %vp
-  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <32 x i16> %mask, zeroinitializer
-  %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer
-  ret <32 x i16> %res
-}
-
-define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) {
-; GENERIC-LABEL: test_4xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) {
-; GENERIC-LABEL: test_4xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:0.50]
-; GENERIC-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) {
-; GENERIC-LABEL: test_4xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) {
-; GENERIC-LABEL: test_4xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-  ret <4 x i32> %res
-}
-define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_4xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2
-  ret <4 x i32> %res
-}
-
-define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) {
-; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm0, %xmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <4 x i32>, <4 x i32>* %vp
-  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer
-  ret <4 x i32> %res
-}
-
-define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) {
-; GENERIC-LABEL: test2_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) {
-; GENERIC-LABEL: test2_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) {
-; GENERIC-LABEL: test2_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) {
-; GENERIC-LABEL: test2_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm0, %ymm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <8 x i32>, <8 x i32>* %vp
-  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) {
-; GENERIC-LABEL: test2_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) {
-; GENERIC-LABEL: test2_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) {
-; GENERIC-LABEL: test2_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
-; GENERIC-LABEL: test2_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) {
-; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm0, %zmm0, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec = load <16 x i32>, <16 x i32>* %vp
-  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test2_8xfloat_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  ret <8 x float> %res
-}
-define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test2_8xfloat_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  ret <8 x float> %res
-}
-define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test2_8xfloat_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
-; GENERIC-LABEL: test_16xfloat_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {
-; GENERIC-LABEL: test_8xi32_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {
-; GENERIC-LABEL: test_8xi32_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm2 {%k1} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {
-; GENERIC-LABEL: test_8xi32_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[4,5,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {
-; GENERIC-LABEL: test_8xi32_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  ret <8 x i32> %res
-}
-define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm1 {%k1} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3
-  ret <8 x i32> %res
-}
-
-define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],mem[0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i32>, <8 x i32>* %vec2p
-  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer
-  ret <8 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {
-; GENERIC-LABEL: test_16xi32_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {
-; GENERIC-LABEL: test_16xi32_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {
-; GENERIC-LABEL: test_16xi32_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
-; GENERIC-LABEL: test_16xi32_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  ret <16 x i32> %res
-}
-define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
-  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer
-  ret <16 x i32> %res
-}
-
-define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
-; GENERIC-LABEL: test_4xi64_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
-; GENERIC-LABEL: test_4xi64_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm2 {%k1} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
-; GENERIC-LABEL: test_4xi64_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
-; GENERIC-LABEL: test_4xi64_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm1 {%k1} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3
-  ret <4 x i64> %res
-}
-
-define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],mem[2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
-  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
-  ret <4 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
-; GENERIC-LABEL: test_8xi64_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
-; GENERIC-LABEL: test_8xi64_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
-; GENERIC-LABEL: test_8xi64_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
-; GENERIC-LABEL: test_8xi64_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
-  ret <8 x i64> %res
-}
-define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
-  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer
-  ret <8 x i64> %res
-}
-
-define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
-; GENERIC-LABEL: test_4xfloat_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
-; GENERIC-LABEL: test_4xfloat_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
-; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
-; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test_8xfloat_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test_8xfloat_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
-; GENERIC-LABEL: test_16xfloat_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) {
-; GENERIC-LABEL: test_16xfloat_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) {
-; GENERIC-LABEL: test_2xdouble_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
-; GENERIC-LABEL: test_2xdouble_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-
-define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
-; GENERIC-LABEL: test_4xfloat_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
-; GENERIC-LABEL: test_4xfloat_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
-; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
-; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  ret <4 x float> %res
-}
-define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) {
-; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x float>, <4 x float>* %vec2p
-  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-  %cmp = icmp eq <4 x i32> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
-  ret <4 x float> %res
-}
-
-define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test_8xfloat_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
-; GENERIC-LABEL: test_8xfloat_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
-; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  ret <8 x float> %res
-}
-define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3
-  ret <8 x float> %res
-}
-
-define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) {
-; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x float>, <8 x float>* %vec2p
-  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  %cmp = icmp eq <8 x i32> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
-  ret <8 x float> %res
-}
-
-define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
-; GENERIC-LABEL: test_16xfloat_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
-; GENERIC-LABEL: test_16xfloat_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    vmovaps %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
-; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x float> %res
-}
-define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    vmovaps %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) {
-; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmd %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <16 x float>, <16 x float>* %vec2p
-  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
-  %cmp = icmp eq <16 x i32> %mask, zeroinitializer
-  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
-  ret <16 x float> %res
-}
-
-define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
-; GENERIC-LABEL: test_2xdouble_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm3, %xmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
-; GENERIC-LABEL: test_2xdouble_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  ret <2 x double> %res
-}
-define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm2, %xmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) {
-; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %xmm1, %xmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <2 x double>, <2 x double>* %vec2p
-  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
-  %cmp = icmp eq <2 x i64> %mask, zeroinitializer
-  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
-  ret <2 x double> %res
-}
-
-define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
-; GENERIC-LABEL: test_4xdouble_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm3, %ymm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
-; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  ret <4 x double> %res
-}
-define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm2, %ymm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) {
-; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %ymm1, %ymm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <4 x double>, <4 x double>* %vec2p
-  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  %cmp = icmp eq <4 x i64> %mask, zeroinitializer
-  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
-  ret <4 x double> %res
-}
-
-define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
-; GENERIC-LABEL: test_8xdouble_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm3, %zmm3, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    vmovapd %zmm2, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
-; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  ret <8 x double> %res
-}
-define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm2, %zmm2, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    vmovapd %zmm1, %zmm0 # sched: [1:0.33]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) {
-; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
-; SKX:       # %bb.0:
-; SKX-NEXT:    vptestnmq %zmm1, %zmm1, %k1 # sched: [3:1.00]
-; SKX-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  %vec2 = load <8 x double>, <8 x double>* %vec2p
-  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-  %cmp = icmp eq <8 x i64> %mask, zeroinitializer
-  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
-  ret <8 x double> %res
-}
-

Removed: llvm/trunk/test/CodeGen/X86/avx512vpopcntdq-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vpopcntdq-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vpopcntdq-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vpopcntdq-schedule.ll (removed)
@@ -1,80 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=icelake-client | FileCheck %s --check-prefix=ICELAKE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=icelake-server | FileCheck %s --check-prefix=ICELAKE
-
-define void @test_vpopcntd(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16 %a3) {
-; GENERIC-LABEL: test_vpopcntd:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovw %esi, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    vpopcntd %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi), %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi), %zmm0 {%k1} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:0.50]
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; ICELAKE-LABEL: test_vpopcntd:
-; ICELAKE:       # %bb.0:
-; ICELAKE-NEXT:    kmovd %esi, %k1 # sched: [1:1.00]
-; ICELAKE-NEXT:    #APP
-; ICELAKE-NEXT:    vpopcntd %zmm1, %zmm0 # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntd %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi), %zmm0 # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi), %zmm0 {%k1} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntd (%rdi){1to16}, %zmm0 {%k1} {z} # sched: [8:1.00]
-; ICELAKE-NEXT:    #NO_APP
-; ICELAKE-NEXT:    vzeroupper # sched: [0:0.67]
-; ICELAKE-NEXT:    retq # sched: [7:1.00]
-  tail call void asm "vpopcntd $1, $0 \0A\09 vpopcntd $1, $0 {$3} \0A\09 vpopcntd $1, $0 {$3} {z} \0A\09 vpopcntd $2, $0 \0A\09 vpopcntd $2, $0 {$3} \0A\09 vpopcntd $2, $0 {$3} {z} \0A\09 vpopcntd $2{1to16}, $0 \0A\09 vpopcntd $2{1to16}, $0 {$3} \0A\09 vpopcntd $2{1to16}, $0 {$3} {z}", "v,v,*m,^Yk"(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> *%a2, i16 %a3) nounwind
-  ret void
-}
-
-define void @test_vpopcntq(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3) {
-; GENERIC-LABEL: test_vpopcntq:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    kmovw %esi, %k1 # sched: [1:0.33]
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    vpopcntq %zmm1, %zmm0 # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntq %zmm1, %zmm0 {%k1} # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [1:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi), %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi), %zmm0 {%k1} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:0.50]
-; GENERIC-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:0.50]
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    vzeroupper # sched: [1:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; ICELAKE-LABEL: test_vpopcntq:
-; ICELAKE:       # %bb.0:
-; ICELAKE-NEXT:    kmovd %esi, %k1 # sched: [1:1.00]
-; ICELAKE-NEXT:    #APP
-; ICELAKE-NEXT:    vpopcntq %zmm1, %zmm0 # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntq %zmm1, %zmm0 {%k1} # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntq %zmm1, %zmm0 {%k1} {z} # sched: [1:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi), %zmm0 # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi), %zmm0 {%k1} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi), %zmm0 {%k1} {z} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} # sched: [8:1.00]
-; ICELAKE-NEXT:    vpopcntq (%rdi){1to8}, %zmm0 {%k1} {z} # sched: [8:1.00]
-; ICELAKE-NEXT:    #NO_APP
-; ICELAKE-NEXT:    vzeroupper # sched: [0:0.67]
-; ICELAKE-NEXT:    retq # sched: [7:1.00]
-  tail call void asm "vpopcntq $1, $0 \0A\09 vpopcntq $1, $0 {$3} \0A\09 vpopcntq $1, $0 {$3} {z} \0A\09 vpopcntq $2, $0 \0A\09 vpopcntq $2, $0 {$3} \0A\09 vpopcntq $2, $0 {$3} {z} \0A\09 vpopcntq $2{1to8}, $0 \0A\09 vpopcntq $2{1to8}, $0 {$3} \0A\09 vpopcntq $2{1to8}, $0 {$3} {z}", "v,v,*m,^Yk"(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> *%a2, i8 %a3) nounwind
-  ret void
-}

Removed: llvm/trunk/test/CodeGen/X86/bmi-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bmi-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bmi-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bmi-schedule.ll (removed)
@@ -1,763 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl     | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=bdver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BDVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2  | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1  | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
-
-define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_andn_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.33]
-; GENERIC-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_andn_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_andn_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; BROADWELL-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_andn_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; SKYLAKE-NEXT:    andnl (%rdx), %edi, %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_andn_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [5:0.50]
-; BDVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_andn_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    andnl (%rdx), %edi, %eax # sched: [4:1.00]
-; BTVER2-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.50]
-; BTVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_andn_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    andnl (%rdx), %edi, %eax # sched: [5:0.50]
-; ZNVER1-NEXT:    andnl %esi, %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = xor i32 %a0, -1
-  %3 = and i32 %2, %a1
-  %4 = and i32 %2, %1
-  %5 = add i32 %3, %4
-  ret i32 %5
-}
-
-define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_andn_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.33]
-; GENERIC-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_andn_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_andn_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BROADWELL-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_andn_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; SKYLAKE-NEXT:    andnq (%rdx), %rdi, %rax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_andn_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [5:0.50]
-; BDVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_andn_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    andnq (%rdx), %rdi, %rax # sched: [4:1.00]
-; BTVER2-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BTVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_andn_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    andnq (%rdx), %rdi, %rax # sched: [5:0.50]
-; ZNVER1-NEXT:    andnq %rsi, %rdi, %rcx # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = xor i64 %a0, -1
-  %3 = and i64 %2, %a1
-  %4 = and i64 %2, %1
-  %5 = add i64 %3, %4
-  ret i64 %5
-}
-
-define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_bextr_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:1.00]
-; GENERIC-NEXT:    bextrl %edi, %esi, %eax # sched: [2:1.00]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_bextr_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
-; HASWELL-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_bextr_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
-; BROADWELL-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_bextr_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [7:0.50]
-; SKYLAKE-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_bextr_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [2:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_bextr_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [4:1.00]
-; BTVER2-NEXT:    bextrl %edi, %esi, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_bextr_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bextrl %edi, (%rdx), %ecx # sched: [5:0.50]
-; ZNVER1-NEXT:    bextrl %edi, %esi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0)
-  %3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0)
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
-
-define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_bextr_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:1.00]
-; GENERIC-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:1.00]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_bextr_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
-; HASWELL-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_bextr_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
-; BROADWELL-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_bextr_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [7:0.50]
-; SKYLAKE-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_bextr_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [2:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_bextr_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [4:1.00]
-; BTVER2-NEXT:    bextrq %rdi, %rsi, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_bextr_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bextrq %rdi, (%rdx), %rcx # sched: [5:0.50]
-; ZNVER1-NEXT:    bextrq %rdi, %rsi, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0)
-  %3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0)
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
-
-define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
-; GENERIC-LABEL: test_blsi_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; GENERIC-NEXT:    blsil %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsi_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; HASWELL-NEXT:    blsil %edi, %eax # sched: [1:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsi_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsil %edi, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsi_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsil %edi, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsi_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsil %edi, %eax # sched: [2:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsi_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsil (%rsi), %ecx # sched: [5:1.00]
-; BTVER2-NEXT:    blsil %edi, %eax # sched: [2:1.00]
-; BTVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsi_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsil (%rsi), %ecx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsil %edi, %eax # sched: [2:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a1
-  %2 = sub i32 0, %1
-  %3 = sub i32 0, %a0
-  %4 = and i32 %1, %2
-  %5 = and i32 %a0, %3
-  %6 = add i32 %4, %5
-  ret i32 %6
-}
-
-define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
-; GENERIC-LABEL: test_blsi_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; GENERIC-NEXT:    blsiq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsi_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; HASWELL-NEXT:    blsiq %rdi, %rax # sched: [1:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsi_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsiq %rdi, %rax # sched: [1:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsi_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsiq %rdi, %rax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsi_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsiq %rdi, %rax # sched: [2:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsi_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsiq (%rsi), %rcx # sched: [5:1.00]
-; BTVER2-NEXT:    blsiq %rdi, %rax # sched: [2:1.00]
-; BTVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsi_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsiq (%rsi), %rcx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsiq %rdi, %rax # sched: [2:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a1
-  %2 = sub i64 0, %1
-  %3 = sub i64 0, %a0
-  %4 = and i64 %1, %2
-  %5 = and i64 %a0, %3
-  %6 = add i64 %4, %5
-  ret i64 %6
-}
-
-define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
-; GENERIC-LABEL: test_blsmsk_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; GENERIC-NEXT:    blsmskl %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsmsk_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; HASWELL-NEXT:    blsmskl %edi, %eax # sched: [1:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsmsk_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsmskl %edi, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsmsk_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsmskl %edi, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsmsk_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsmskl %edi, %eax # sched: [2:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsmsk_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsmskl (%rsi), %ecx # sched: [5:1.00]
-; BTVER2-NEXT:    blsmskl %edi, %eax # sched: [2:1.00]
-; BTVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsmsk_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsmskl (%rsi), %ecx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsmskl %edi, %eax # sched: [2:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a1
-  %2 = sub i32 %1, 1
-  %3 = sub i32 %a0, 1
-  %4 = xor i32 %1, %2
-  %5 = xor i32 %a0, %3
-  %6 = add i32 %4, %5
-  ret i32 %6
-}
-
-define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
-; GENERIC-LABEL: test_blsmsk_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; GENERIC-NEXT:    blsmskq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsmsk_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; HASWELL-NEXT:    blsmskq %rdi, %rax # sched: [1:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsmsk_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsmskq %rdi, %rax # sched: [1:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsmsk_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsmskq %rdi, %rax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsmsk_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsmskq %rdi, %rax # sched: [2:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsmsk_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsmskq (%rsi), %rcx # sched: [5:1.00]
-; BTVER2-NEXT:    blsmskq %rdi, %rax # sched: [2:1.00]
-; BTVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsmsk_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsmskq (%rsi), %rcx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsmskq %rdi, %rax # sched: [2:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a1
-  %2 = sub i64 %1, 1
-  %3 = sub i64 %a0, 1
-  %4 = xor i64 %1, %2
-  %5 = xor i64 %a0, %3
-  %6 = add i64 %4, %5
-  ret i64 %6
-}
-
-define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
-; GENERIC-LABEL: test_blsr_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; GENERIC-NEXT:    blsrl %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsr_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; HASWELL-NEXT:    blsrl %edi, %eax # sched: [1:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsr_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsrl %edi, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsr_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsrl %edi, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsr_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; BDVER2-NEXT:    blsrl %edi, %eax # sched: [2:0.50]
-; BDVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsr_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsrl (%rsi), %ecx # sched: [5:1.00]
-; BTVER2-NEXT:    blsrl %edi, %eax # sched: [2:1.00]
-; BTVER2-NEXT:    addl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsr_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsrl (%rsi), %ecx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsrl %edi, %eax # sched: [2:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a1
-  %2 = sub i32 %1, 1
-  %3 = sub i32 %a0, 1
-  %4 = and i32 %1, %2
-  %5 = and i32 %a0, %3
-  %6 = add i32 %4, %5
-  ret i32 %6
-}
-
-define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
-; GENERIC-LABEL: test_blsr_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; GENERIC-NEXT:    blsrq %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_blsr_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; HASWELL-NEXT:    blsrq %rdi, %rax # sched: [1:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_blsr_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; BROADWELL-NEXT:    blsrq %rdi, %rax # sched: [1:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_blsr_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; SKYLAKE-NEXT:    blsrq %rdi, %rax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_blsr_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; BDVER2-NEXT:    blsrq %rdi, %rax # sched: [2:0.50]
-; BDVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_blsr_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    blsrq (%rsi), %rcx # sched: [5:1.00]
-; BTVER2-NEXT:    blsrq %rdi, %rax # sched: [2:1.00]
-; BTVER2-NEXT:    addq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_blsr_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    blsrq (%rsi), %rcx # sched: [6:0.50]
-; ZNVER1-NEXT:    blsrq %rdi, %rax # sched: [2:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a1
-  %2 = sub i64 %1, 1
-  %3 = sub i64 %a0, 1
-  %4 = and i64 %1, %2
-  %5 = and i64 %a0, %3
-  %6 = add i64 %4, %5
-  ret i64 %6
-}
-
-define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
-; GENERIC-LABEL: test_cttz_i16:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
-; GENERIC-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
-; GENERIC-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    # kill: def $ax killed $ax killed $eax
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_cttz_i16:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
-; HASWELL-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
-; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    # kill: def $ax killed $ax killed $eax
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_cttz_i16:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
-; BROADWELL-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
-; BROADWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    # kill: def $ax killed $ax killed $eax
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_cttz_i16:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    tzcntw (%rsi), %cx # sched: [8:1.00]
-; SKYLAKE-NEXT:    tzcntw %di, %ax # sched: [3:1.00]
-; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    # kill: def $ax killed $ax killed $eax
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_cttz_i16:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntw (%rsi), %cx # sched: [6:1.00]
-; BDVER2-NEXT:    tzcntw %di, %ax # sched: [2:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_cttz_i16:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    tzcntw (%rsi), %cx # sched: [5:1.00]
-; BTVER2-NEXT:    tzcntw %di, %ax # sched: [2:1.00]
-; BTVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    # kill: def $ax killed $ax killed $eax
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_cttz_i16:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    tzcntw (%rsi), %cx # sched: [6:0.50]
-; ZNVER1-NEXT:    tzcntw %di, %ax # sched: [2:0.25]
-; ZNVER1-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    # kill: def $ax killed $ax killed $eax
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i16, i16 *%a1
-  %2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false )
-  %3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false )
-  %4 = or i16 %2, %3
-  ret i16 %4
-}
-declare i16 @llvm.cttz.i16(i16, i1)
-
-define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
-; GENERIC-LABEL: test_cttz_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
-; GENERIC-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
-; GENERIC-NEXT:    orl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_cttz_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
-; HASWELL-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_cttz_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
-; BROADWELL-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_cttz_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    tzcntl (%rsi), %ecx # sched: [8:1.00]
-; SKYLAKE-NEXT:    tzcntl %edi, %eax # sched: [3:1.00]
-; SKYLAKE-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_cttz_i32:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [6:1.00]
-; BDVER2-NEXT:    tzcntl %edi, %eax # sched: [2:1.00]
-; BDVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_cttz_i32:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    tzcntl (%rsi), %ecx # sched: [5:1.00]
-; BTVER2-NEXT:    tzcntl %edi, %eax # sched: [2:1.00]
-; BTVER2-NEXT:    orl %ecx, %eax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_cttz_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    tzcntl (%rsi), %ecx # sched: [6:0.50]
-; ZNVER1-NEXT:    tzcntl %edi, %eax # sched: [2:0.25]
-; ZNVER1-NEXT:    orl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a1
-  %2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false )
-  %3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false )
-  %4 = or i32 %2, %3
-  ret i32 %4
-}
-declare i32 @llvm.cttz.i32(i32, i1)
-
-define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
-; GENERIC-LABEL: test_cttz_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
-; GENERIC-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
-; GENERIC-NEXT:    orq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_cttz_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
-; HASWELL-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
-; HASWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_cttz_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
-; BROADWELL-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
-; BROADWELL-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_cttz_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    tzcntq (%rsi), %rcx # sched: [8:1.00]
-; SKYLAKE-NEXT:    tzcntq %rdi, %rax # sched: [3:1.00]
-; SKYLAKE-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; BDVER2-LABEL: test_cttz_i64:
-; BDVER2:       # %bb.0:
-; BDVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [6:1.00]
-; BDVER2-NEXT:    tzcntq %rdi, %rax # sched: [2:1.00]
-; BDVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
-; BDVER2-NEXT:    retq # sched: [5:1.00]
-;
-; BTVER2-LABEL: test_cttz_i64:
-; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    tzcntq (%rsi), %rcx # sched: [5:1.00]
-; BTVER2-NEXT:    tzcntq %rdi, %rax # sched: [2:1.00]
-; BTVER2-NEXT:    orq %rcx, %rax # sched: [1:0.50]
-; BTVER2-NEXT:    retq # sched: [4:1.00]
-;
-; ZNVER1-LABEL: test_cttz_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    tzcntq (%rsi), %rcx # sched: [6:0.50]
-; ZNVER1-NEXT:    tzcntq %rdi, %rax # sched: [2:0.25]
-; ZNVER1-NEXT:    orq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a1
-  %2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false )
-  %3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false )
-  %4 = or i64 %2, %3
-  ret i64 %4
-}
-declare i64 @llvm.cttz.i64(i64, i1)

Removed: llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bmi2-schedule.ll (removed)
@@ -1,811 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+bmi2 | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=broadwell | FileCheck %s --check-prefix=CHECK --check-prefix=BROADWELL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl     | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1  | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
-
-define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_bzhi_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [6:1.00]
-; GENERIC-NEXT:    bzhil %edi, %esi, %eax # sched: [1:1.00]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_bzhi_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
-; HASWELL-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_bzhi_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
-; BROADWELL-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_bzhi_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
-; SKYLAKE-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_bzhi_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [6:0.50]
-; KNL-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.50]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_bzhi_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bzhil %edi, (%rdx), %ecx # sched: [5:0.50]
-; ZNVER1-NEXT:    bzhil %edi, %esi, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0)
-  %3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0)
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
-
-define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_bzhi_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [6:1.00]
-; GENERIC-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:1.00]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_bzhi_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
-; HASWELL-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_bzhi_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
-; BROADWELL-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_bzhi_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
-; SKYLAKE-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_bzhi_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [6:0.50]
-; KNL-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.50]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_bzhi_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50]
-; ZNVER1-NEXT:    bzhiq %rdi, %rsi, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0)
-  %3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0)
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
-
-define void @test_mulx_i32(i32 %a0, i32 %a1, i32* %a2) optsize {
-; GENERIC-LABEL: test_mulx_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    mulxl %esi, %esi, %edi # sched: [4:1.00]
-; GENERIC-NEXT:    mulxl (%rdx), %esi, %edi # sched: [9:1.00]
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_mulx_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    mulxl %esi, %esi, %edi # sched: [4:1.00]
-; HASWELL-NEXT:    mulxl (%rdx), %esi, %edi # sched: [9:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_mulx_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    mulxl %esi, %esi, %edi # sched: [4:1.00]
-; BROADWELL-NEXT:    mulxl (%rdx), %esi, %edi # sched: [9:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_mulx_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    mulxl %esi, %esi, %edi # sched: [4:1.00]
-; SKYLAKE-NEXT:    mulxl (%rdx), %esi, %edi # sched: [9:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_mulx_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    #APP
-; KNL-NEXT:    mulxl %esi, %esi, %edi # sched: [4:1.00]
-; KNL-NEXT:    mulxl (%rdx), %esi, %edi # sched: [9:1.00]
-; KNL-NEXT:    #NO_APP
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_mulx_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    mulxl %esi, %esi, %edi # sched: [3:2.00]
-; ZNVER1-NEXT:    mulxl (%rdx), %esi, %edi # sched: [8:2.00]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  tail call void asm "mulx $1, $1, $0 \0A\09 mulx $2, $1, $0 ", "r,r,*m"(i32 %a0, i32 %a1, i32* %a2) nounwind
-  ret void
-}
-
-define void @test_mulx_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_mulx_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    #APP
-; GENERIC-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
-; GENERIC-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
-; GENERIC-NEXT:    #NO_APP
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_mulx_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    #APP
-; HASWELL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
-; HASWELL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
-; HASWELL-NEXT:    #NO_APP
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_mulx_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    #APP
-; BROADWELL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
-; BROADWELL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
-; BROADWELL-NEXT:    #NO_APP
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_mulx_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    #APP
-; SKYLAKE-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
-; SKYLAKE-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
-; SKYLAKE-NEXT:    #NO_APP
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_mulx_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    #APP
-; KNL-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [4:1.00]
-; KNL-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [9:1.00]
-; KNL-NEXT:    #NO_APP
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_mulx_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    #APP
-; ZNVER1-NEXT:    mulxq %rsi, %rsi, %rdi # sched: [3:1.00]
-; ZNVER1-NEXT:    mulxq (%rdx), %rsi, %rdi # sched: [8:1.00]
-; ZNVER1-NEXT:    #NO_APP
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  tail call void asm "mulx $1, $1, $0 \0A\09 mulx $2, $1, $0 ", "r,r,*m"(i64 %a0, i64 %a1, i64* %a2) nounwind
-  ret void
-}
-
-define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_pdep_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [6:0.50]
-; GENERIC-NEXT:    pdepl %esi, %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_pdep_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
-; HASWELL-NEXT:    pdepl %esi, %edi, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pdep_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
-; BROADWELL-NEXT:    pdepl %esi, %edi, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_pdep_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
-; SKYLAKE-NEXT:    pdepl %esi, %edi, %eax # sched: [3:1.00]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_pdep_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [8:1.00]
-; KNL-NEXT:    pdepl %esi, %edi, %eax # sched: [3:1.00]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_pdep_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    pdepl (%rdx), %edi, %ecx # sched: [100:0.25]
-; ZNVER1-NEXT:    pdepl %esi, %edi, %eax # sched: [100:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1)
-  %3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
-
-define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_pdep_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [6:0.50]
-; GENERIC-NEXT:    pdepq %rsi, %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_pdep_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; HASWELL-NEXT:    pdepq %rsi, %rdi, %rax # sched: [3:1.00]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pdep_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; BROADWELL-NEXT:    pdepq %rsi, %rdi, %rax # sched: [3:1.00]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_pdep_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; SKYLAKE-NEXT:    pdepq %rsi, %rdi, %rax # sched: [3:1.00]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_pdep_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; KNL-NEXT:    pdepq %rsi, %rdi, %rax # sched: [3:1.00]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_pdep_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    pdepq (%rdx), %rdi, %rcx # sched: [100:0.25]
-; ZNVER1-NEXT:    pdepq %rsi, %rdi, %rax # sched: [100:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1)
-  %3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
-
-define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_pext_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    pextl (%rdx), %edi, %ecx # sched: [6:0.50]
-; GENERIC-NEXT:    pextl %esi, %edi, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_pext_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    pextl (%rdx), %edi, %ecx # sched: [8:1.00]
-; HASWELL-NEXT:    pextl %esi, %edi, %eax # sched: [3:1.00]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pext_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    pextl (%rdx), %edi, %ecx # sched: [8:1.00]
-; BROADWELL-NEXT:    pextl %esi, %edi, %eax # sched: [3:1.00]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_pext_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    pextl (%rdx), %edi, %ecx # sched: [8:1.00]
-; SKYLAKE-NEXT:    pextl %esi, %edi, %eax # sched: [3:1.00]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_pext_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    pextl (%rdx), %edi, %ecx # sched: [8:1.00]
-; KNL-NEXT:    pextl %esi, %edi, %eax # sched: [3:1.00]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_pext_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    pextl (%rdx), %edi, %ecx # sched: [100:0.25]
-; ZNVER1-NEXT:    pextl %esi, %edi, %eax # sched: [100:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1)
-  %3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-declare i32 @llvm.x86.bmi.pext.32(i32, i32)
-
-define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_pext_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [6:0.50]
-; GENERIC-NEXT:    pextq %rsi, %rdi, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_pext_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; HASWELL-NEXT:    pextq %rsi, %rdi, %rax # sched: [3:1.00]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_pext_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; BROADWELL-NEXT:    pextq %rsi, %rdi, %rax # sched: [3:1.00]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_pext_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; SKYLAKE-NEXT:    pextq %rsi, %rdi, %rax # sched: [3:1.00]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_pext_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [8:1.00]
-; KNL-NEXT:    pextq %rsi, %rdi, %rax # sched: [3:1.00]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_pext_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    pextq (%rdx), %rdi, %rcx # sched: [100:0.25]
-; ZNVER1-NEXT:    pextq %rsi, %rdi, %rax # sched: [100:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1)
-  %3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-declare i64 @llvm.x86.bmi.pext.64(i64, i64)
-
-define i32 @test_rorx_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_rorx_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; GENERIC-NEXT:    rorxl $5, (%rdx), %eax # sched: [6:0.50]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_rorx_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    rorxl $5, (%rdx), %eax # sched: [6:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_rorx_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; BROADWELL-NEXT:    rorxl $5, (%rdx), %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_rorx_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; SKYLAKE-NEXT:    rorxl $5, (%rdx), %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_rorx_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.50]
-; KNL-NEXT:    rorxl $5, (%rdx), %eax # sched: [6:0.50]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_rorx_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    rorxl $5, (%rdx), %eax # sched: [5:0.50]
-; ZNVER1-NEXT:    rorxl $5, %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = lshr i32 %a0, 5
-  %3 = shl i32 %a0, 27
-  %4 = or i32 %2, %3
-  %5 = lshr i32 %1, 5
-  %6 = shl i32 %1, 27
-  %7 = or i32 %5, %6
-  %8 = add i32 %4, %7
-  ret i32 %8
-}
-
-define i64 @test_rorx_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_rorx_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; GENERIC-NEXT:    rorxq $5, (%rdx), %rax # sched: [6:0.50]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_rorx_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    rorxq $5, (%rdx), %rax # sched: [6:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_rorx_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; BROADWELL-NEXT:    rorxq $5, (%rdx), %rax # sched: [6:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_rorx_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; SKYLAKE-NEXT:    rorxq $5, (%rdx), %rax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_rorx_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.50]
-; KNL-NEXT:    rorxq $5, (%rdx), %rax # sched: [6:0.50]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_rorx_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    rorxq $5, (%rdx), %rax # sched: [5:0.50]
-; ZNVER1-NEXT:    rorxq $5, %rdi, %rcx # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = lshr i64 %a0, 5
-  %3 = shl i64 %a0, 59
-  %4 = or i64 %2, %3
-  %5 = lshr i64 %1, 5
-  %6 = shl i64 %1, 59
-  %7 = or i64 %5, %6
-  %8 = add i64 %4, %7
-  ret i64 %8
-}
-
-define i32 @test_sarx_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_sarx_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; GENERIC-NEXT:    sarxl %esi, (%rdx), %eax # sched: [6:0.50]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_sarx_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    sarxl %esi, (%rdx), %eax # sched: [6:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_sarx_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; BROADWELL-NEXT:    sarxl %esi, (%rdx), %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_sarx_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; SKYLAKE-NEXT:    sarxl %esi, (%rdx), %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_sarx_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.50]
-; KNL-NEXT:    sarxl %esi, (%rdx), %eax # sched: [6:0.50]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_sarx_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    sarxl %esi, (%rdx), %eax # sched: [5:0.50]
-; ZNVER1-NEXT:    sarxl %esi, %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = ashr i32 %a0, %a1
-  %3 = ashr i32 %1, %a1
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-
-define i64 @test_sarx_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_sarx_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; GENERIC-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_sarx_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_sarx_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BROADWELL-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_sarx_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; SKYLAKE-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_sarx_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; KNL-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_sarx_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    sarxq %rsi, (%rdx), %rax # sched: [5:0.50]
-; ZNVER1-NEXT:    sarxq %rsi, %rdi, %rcx # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = ashr i64 %a0, %a1
-  %3 = ashr i64 %1, %a1
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-
-define i32 @test_shlx_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_shlx_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; GENERIC-NEXT:    shlxl %esi, (%rdx), %eax # sched: [6:0.50]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_shlx_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    shlxl %esi, (%rdx), %eax # sched: [6:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_shlx_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; BROADWELL-NEXT:    shlxl %esi, (%rdx), %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_shlx_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; SKYLAKE-NEXT:    shlxl %esi, (%rdx), %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_shlx_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.50]
-; KNL-NEXT:    shlxl %esi, (%rdx), %eax # sched: [6:0.50]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_shlx_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    shlxl %esi, (%rdx), %eax # sched: [5:0.50]
-; ZNVER1-NEXT:    shlxl %esi, %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = shl i32 %a0, %a1
-  %3 = shl i32 %1, %a1
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-
-define i64 @test_shlx_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_shlx_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; GENERIC-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_shlx_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_shlx_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BROADWELL-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_shlx_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; SKYLAKE-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_shlx_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; KNL-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_shlx_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    shlxq %rsi, (%rdx), %rax # sched: [5:0.50]
-; ZNVER1-NEXT:    shlxq %rsi, %rdi, %rcx # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = shl i64 %a0, %a1
-  %3 = shl i64 %1, %a1
-  %4 = add i64 %2, %3
-  ret i64 %4
-}
-
-define i32 @test_shrx_i32(i32 %a0, i32 %a1, i32 *%a2) {
-; GENERIC-LABEL: test_shrx_i32:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; GENERIC-NEXT:    shrxl %esi, (%rdx), %eax # sched: [6:0.50]
-; GENERIC-NEXT:    addl %ecx, %eax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_shrx_i32:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; HASWELL-NEXT:    shrxl %esi, (%rdx), %eax # sched: [6:0.50]
-; HASWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_shrx_i32:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; BROADWELL-NEXT:    shrxl %esi, (%rdx), %eax # sched: [6:0.50]
-; BROADWELL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_shrx_i32:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; SKYLAKE-NEXT:    shrxl %esi, (%rdx), %eax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_shrx_i32:
-; KNL:       # %bb.0:
-; KNL-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.50]
-; KNL-NEXT:    shrxl %esi, (%rdx), %eax # sched: [6:0.50]
-; KNL-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_shrx_i32:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    shrxl %esi, (%rdx), %eax # sched: [5:0.50]
-; ZNVER1-NEXT:    shrxl %esi, %edi, %ecx # sched: [1:0.25]
-; ZNVER1-NEXT:    addl %ecx, %eax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i32, i32 *%a2
-  %2 = lshr i32 %a0, %a1
-  %3 = lshr i32 %1, %a1
-  %4 = add i32 %2, %3
-  ret i32 %4
-}
-
-define i64 @test_shrx_i64(i64 %a0, i64 %a1, i64 *%a2) {
-; GENERIC-LABEL: test_shrx_i64:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; GENERIC-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; GENERIC-NEXT:    addq %rcx, %rax # sched: [1:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; HASWELL-LABEL: test_shrx_i64:
-; HASWELL:       # %bb.0:
-; HASWELL-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; HASWELL-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; HASWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; HASWELL-NEXT:    retq # sched: [7:1.00]
-;
-; BROADWELL-LABEL: test_shrx_i64:
-; BROADWELL:       # %bb.0:
-; BROADWELL-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; BROADWELL-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; BROADWELL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; BROADWELL-NEXT:    retq # sched: [7:1.00]
-;
-; SKYLAKE-LABEL: test_shrx_i64:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; SKYLAKE-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; SKYLAKE-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; KNL-LABEL: test_shrx_i64:
-; KNL:       # %bb.0:
-; KNL-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.50]
-; KNL-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [6:0.50]
-; KNL-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; KNL-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: test_shrx_i64:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    shrxq %rsi, (%rdx), %rax # sched: [5:0.50]
-; ZNVER1-NEXT:    shrxq %rsi, %rdi, %rcx # sched: [1:0.25]
-; ZNVER1-NEXT:    addq %rcx, %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  %1 = load i64, i64 *%a2
-  %2 = lshr i64 %a0, %a1
-  %3 = lshr i64 %1, %a1
-  %4 = add i64 %2, %3
-  ret i64 %4
-}

Removed: llvm/trunk/test/CodeGen/X86/clflushopt-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clflushopt-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clflushopt-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clflushopt-schedule.ll (removed)
@@ -1,36 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clflushopt | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=goldmont | FileCheck %s --check-prefix=CHECK --check-prefix=GLM
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=SKYLAKE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
-
-define void @clflushopt(i8* %p) nounwind {
-; GENERIC-LABEL: clflushopt:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    clflushopt (%rdi) # sched: [5:1.00]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; GLM-LABEL: clflushopt:
-; GLM:       # %bb.0:
-; GLM-NEXT:    clflushopt (%rdi) # sched: [3:1.00]
-; GLM-NEXT:    retq # sched: [4:1.00]
-;
-; SKYLAKE-LABEL: clflushopt:
-; SKYLAKE:       # %bb.0:
-; SKYLAKE-NEXT:    clflushopt (%rdi) # sched: [2:1.00]
-; SKYLAKE-NEXT:    retq # sched: [7:1.00]
-;
-; SKX-LABEL: clflushopt:
-; SKX:       # %bb.0:
-; SKX-NEXT:    clflushopt (%rdi) # sched: [2:1.00]
-; SKX-NEXT:    retq # sched: [7:1.00]
-;
-; ZNVER1-LABEL: clflushopt:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    clflushopt (%rdi) # sched: [8:0.50]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  tail call void @llvm.x86.clflushopt(i8* %p)
-  ret void
-}
-declare void @llvm.x86.clflushopt(i8*) nounwind

Removed: llvm/trunk/test/CodeGen/X86/clwb-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clwb-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clwb-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clwb-schedule.ll (removed)
@@ -1,18 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clwb | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX
-
-define void @clwb(i8* %a0) nounwind {
-; GENERIC-LABEL: clwb:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    clwb (%rdi) # sched: [5:0.50]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; SKX-LABEL: clwb:
-; SKX:       # %bb.0:
-; SKX-NEXT:    clwb (%rdi) # sched: [5:0.50]
-; SKX-NEXT:    retq # sched: [7:1.00]
-  tail call void @llvm.x86.clwb(i8* %a0)
-  ret void
-}
-declare void @llvm.x86.clwb(i8*) nounwind

Removed: llvm/trunk/test/CodeGen/X86/clzero-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/clzero-schedule.ll?rev=353042&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/clzero-schedule.ll (original)
+++ llvm/trunk/test/CodeGen/X86/clzero-schedule.ll (removed)
@@ -1,20 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+clzero | FileCheck %s --check-prefix=GENERIC
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
-
-define void @test_clzero(i8* %p) {
-; GENERIC-LABEL: test_clzero:
-; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    leaq (%rdi), %rax # sched: [1:0.50]
-; GENERIC-NEXT:    clzero # sched: [100:0.33]
-; GENERIC-NEXT:    retq # sched: [1:1.00]
-;
-; ZNVER1-LABEL: test_clzero:
-; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    leaq (%rdi), %rax # sched: [1:0.25]
-; ZNVER1-NEXT:    clzero # sched: [100:0.25]
-; ZNVER1-NEXT:    retq # sched: [1:0.50]
-  tail call void @llvm.x86.clzero(i8* %p)
-  ret void
-}
-declare void @llvm.x86.clzero(i8*)




More information about the llvm-commits mailing list