[llvm] r314594 - [X86][SKX] Added codegen regression test for avx512 instructions scheduling.NFC.

Gadi Haber via llvm-commits llvm-commits at lists.llvm.org
Sat Sep 30 07:30:23 PDT 2017


Author: gadi.haber
Date: Sat Sep 30 07:30:23 2017
New Revision: 314594

URL: http://llvm.org/viewvc/llvm-project?rev=314594&view=rev
Log:
[X86][SKX] Added codegen regression test for avx512 instructions scheduling.NFC.

NFC.
 Added codegen regression tests for AVX-512 instruction scheduling, named avx512-schedule.ll and
 avx512-shuffle-schedule.ll.
 This patch prepares for a larger patch that adds scheduling information for all SKX instructions;
 therefore, the scheduling information for the AVX-512 instructions is still missing.

Reviewers: zvi, delena, RKSimon, igorb
Differential Revision: https://reviews.llvm.org/D38035

Change-Id: I792762763127a921b9e13684b58af03646536533

Added:
    llvm/trunk/test/CodeGen/X86/avx512-schedule.ll   (with props)
    llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll   (with props)

Added: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-schedule.ll?rev=314594&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-schedule.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512-schedule.ll Sat Sep 30 07:30:23 2017
@@ -0,0 +1,7222 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK
+; This test is a collection of AVX-512 instructions used to check the scheduling info printed by -print-schedule for SKX
+
+define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: addpd512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %add.i = fadd <8 x double> %x, %y
+  ret <8 x double> %add.i
+}
+
+define <8 x double> @addpd512fold(<8 x double> %y) {
+; CHECK-LABEL: addpd512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %add.i = fadd <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.800000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %add.i
+}
+
+define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: addps512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %add.i = fadd <16 x float> %x, %y
+  ret <16 x float> %add.i
+}
+
+define <16 x float> @addps512fold(<16 x float> %y) {
+; CHECK-LABEL: addps512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %add.i = fadd <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 4.500000e+00, float 4.500000e+00, float 0x400B333340000000,  float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %add.i
+}
+
+define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: subpd512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsubpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %sub.i = fsub <8 x double> %x, %y
+  ret <8 x double> %sub.i
+}
+
+define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) {
+; CHECK-LABEL: subpd512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsubpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp2 = load <8 x double>, <8 x double>* %x, align 8
+  %sub.i = fsub <8 x double> %y, %tmp2
+  ret <8 x double> %sub.i
+}
+
+define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: subps512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsubps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %sub.i = fsub <16 x float> %x, %y
+  ret <16 x float> %sub.i
+}
+
+define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) {
+; CHECK-LABEL: subps512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsubps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp2 = load <16 x float>, <16 x float>* %x, align 4
+  %sub.i = fsub <16 x float> %y, %tmp2
+  ret <16 x float> %sub.i
+}
+
+define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { ; 512-bit i64 vector multiply; expected to select a single vpmullq on zmm registers
+; CHECK-LABEL: imulq512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: imulq512:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %z = mul <8 x i64>%x, %y ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  ret <8 x i64>%z
+}
+
+define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ; 256-bit i64 vector multiply; expected to select a single vpmullq on ymm registers
+; CHECK-LABEL: imulq256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: imulq256:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpmullq %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    retq
+  %z = mul <4 x i64>%x, %y ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  ret <4 x i64>%z
+}
+
+define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; 128-bit i64 vector multiply; expected to select a single vpmullq on xmm registers
+; CHECK-LABEL: imulq128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmullq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: imulq128:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpmullq %xmm0, %xmm1, %xmm0
+; SKX-NEXT:    retq
+  %z = mul <2 x i64>%x, %y ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  ret <2 x i64>%z
+}
+
+define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: mulpd512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %mul.i = fmul <8 x double> %x, %y
+  ret <8 x double> %mul.i
+}
+
+define <8 x double> @mulpd512fold(<8 x double> %y) {
+; CHECK-LABEL: mulpd512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmulpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %mul.i = fmul <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %mul.i
+}
+
+define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: mulps512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %mul.i = fmul <16 x float> %x, %y
+  ret <16 x float> %mul.i
+}
+
+define <16 x float> @mulps512fold(<16 x float> %y) {
+; CHECK-LABEL: mulps512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmulps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %mul.i = fmul <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %mul.i
+}
+
+define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) {
+; CHECK-LABEL: divpd512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vdivpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %div.i = fdiv <8 x double> %x, %y
+  ret <8 x double> %div.i
+}
+
+define <8 x double> @divpd512fold(<8 x double> %y) {
+; CHECK-LABEL: divpd512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vdivpd {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %div.i = fdiv <8 x double> %y, <double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00, double 4.500000e+00, double 3.400000e+00, double 2.300000e+00, double 1.200000e+00>
+  ret <8 x double> %div.i
+}
+
+define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) {
+; CHECK-LABEL: divps512:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vdivps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %div.i = fdiv <16 x float> %x, %y
+  ret <16 x float> %div.i
+}
+
+define <16 x float> @divps512fold(<16 x float> %y) {
+; CHECK-LABEL: divps512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vdivps {{.*}}(%rip), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %div.i = fdiv <16 x float> %y, <float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 0x400B333340000000, float 0x4002666660000000, float 0x3FF3333340000000, float 4.500000e+00, float 4.500000e+00, float 0x4002666660000000, float 0x3FF3333340000000>
+  ret <16 x float> %div.i
+}
+
+define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpaddq_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = add <8 x i64> %i, %j
+  ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+; CHECK-LABEL: vpaddq_fold_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tmp = load <8 x i64>, <8 x i64>* %j, align 4
+  %x = add <8 x i64> %i, %tmp
+  ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+; CHECK-LABEL: vpaddq_broadcast_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = add <8 x i64> %i, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %x
+}
+
+define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { ; add of a splatted scalar load; expected to fold into vpaddq's {1to8} broadcast memory operand
+; CHECK-LABEL: vpaddq_broadcast2_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tmp = load i64, i64* %j ; single scalar that is replicated into every lane below
+  %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0 ; the eight insertelements write %tmp into lanes 0..7, i.e. build a splat by hand
+  %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
+  %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
+  %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
+  %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
+  %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
+  %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
+  %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
+  %x = add <8 x i64> %i, %j.7
+  ret <8 x i64> %x
+}
+
+define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpaddd_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = add <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+; CHECK-LABEL: vpaddd_fold_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tmp = load <16 x i32>, <16 x i32>* %j, align 4
+  %x = add <16 x i32> %i, %tmp
+  ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+; CHECK-LABEL: vpaddd_broadcast_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = add <16 x i32> %i, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <16 x i32> %x
+}
+
+define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; merge-masked 512-bit i32 add: lanes with a zero %mask1 element keep %i
+; CHECK-LABEL: vpaddd_mask_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer ; per-lane predicate; expected as vpxor (zero vector) + vpcmpneqd into %k1
+  %x = add <16 x i32> %i, %j
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i ; passthrough is the first add operand, so the select is expected to fold into the {%k1} merge-masked vpaddd
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { ; zero-masked 512-bit i32 add: lanes with a zero %mask1 element become 0
+; CHECK-LABEL: vpaddd_maskz_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer ; per-lane predicate; expected as vpxor (zero vector) + vpcmpneqd into %k1
+  %x = add <16 x i32> %i, %j
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ; zero passthrough, so the select is expected to fold into the {%k1} {z} form of vpaddd
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_fold_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %j = load <16 x i32>, <16 x i32>* %j.ptr
+  %x = add <16 x i32> %i, %j
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_mask_broadcast_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = add <16 x i32> %i, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_fold_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %j = load <16 x i32>, <16 x i32>* %j.ptr
+  %x = add <16 x i32> %i, %j
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+  ret <16 x i32> %r
+}
+
+define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+; CHECK-LABEL: vpaddd_maskz_broadcast_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = add <16 x i32> %i, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+  ret <16 x i32> %r
+}
+
+define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
+; CHECK-LABEL: vpsubq_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = sub <8 x i64> %i, %j
+  ret <8 x i64> %x
+}
+
+define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
+; CHECK-LABEL: vpsubd_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = sub <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
+; CHECK-LABEL: vpmulld_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = mul <16 x i32> %i, %j
+  ret <16 x i32> %x
+}
+
+declare float @sqrtf(float) readnone
+define float @sqrtA(float %a) nounwind uwtable readnone ssp { ; scalar float sqrt through the sqrtf library function declared readnone above
+; CHECK-LABEL: sqrtA:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %conv1 = tail call float @sqrtf(float %a) nounwind readnone ; expected to become an inline vsqrtss (no call instruction appears in the checked output)
+  ret float %conv1
+}
+
+declare double @sqrt(double) readnone
+define double @sqrtB(double %a) nounwind uwtable readnone ssp {
+; CHECK-LABEL: sqrtB:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %call = tail call double @sqrt(double %a) nounwind readnone
+  ret double %call
+}
+
+declare float @llvm.sqrt.f32(float)
+define float @sqrtC(float %a) nounwind {
+; CHECK-LABEL: sqrtC:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = call float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+; CHECK-LABEL: sqrtD:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vsqrtps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
+  ret <16 x float> %b
+}
+
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+; CHECK-LABEL: sqrtE:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
+  ret <8 x double> %b
+}
+
+define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind {
+; CHECK-LABEL: fadd_broadcast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fadd <16 x float> %a, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <16 x float> %b
+}
+
+define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind {
+; CHECK-LABEL: addq_broadcast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = add <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
+  ret <8 x i64> %b
+}
+
+define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { ; integer OR with a splat constant; the checked output uses the FP-domain vorpd with a RIP-relative {1to8} broadcast operand
+; CHECK-LABEL: orq_broadcast:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: orq_broadcast:
+; SKX:       # BB#0:
+; SKX-NEXT:    vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %b = or <8 x i64> %a, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2> ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  ret <8 x i64> %b
+}
+
+define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { ; integer AND with a loaded vector; the checked output folds the load into an FP-domain vandps memory operand
+; CHECK-LABEL: andd512fold:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vandps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: andd512fold:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    vandps (%rdi), %zmm0, %zmm0
+; SKX-NEXT:    retq
+entry:
+  %a = load <16 x i32>, <16 x i32>* %x, align 4 ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  %b = and <16 x i32> %y, %a
+  ret <16 x i32> %b
+}
+
+define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { ; integer AND with a splatted scalar load; the checked output is vandpd with a {1to8} broadcast memory operand
+; CHECK-LABEL: andqbrst:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: andqbrst:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    vandpd (%rdi){1to8}, %zmm0, %zmm0
+; SKX-NEXT:    retq
+entry:
+  %a = load i64, i64* %ap, align 8 ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer ; all-zero shuffle mask replicates lane 0, i.e. splats %a
+  %d = and <8 x i64> %p1, %c
+  ret <8 x i64>%d
+}
+
+define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vaddps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vaddps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fadd <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vmulps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vmulps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fmul <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, ; merge-masked float min; the signature continues after the autogenerated check lines
+; CHECK-LABEL: test_mask_vminps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vminps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer ; lanes where %mask1 is non-zero are the active lanes (%k1 in the checked output)
+  %cmp_res = fcmp olt <16 x float> %i, %j
+  %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j ; fcmp olt + select is the min idiom that is expected to match vminps
+  %r = select <16 x i1> %mask, <16 x float> %min, <16 x float> %dst ; inactive lanes keep %dst, matching the {%k1} merge-masked form
+  ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ; merge-masked double min; the signature continues after the autogenerated check lines
+; CHECK-LABEL: test_mask_vminpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mask_vminpd:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vminpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+                                     <8 x double> %j, <8 x i32> %mask1)
+                                     nounwind readnone { ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer ; the mask source is <8 x i32>, so the expected compare is a 256-bit (ymm) vpcmpneqd
+  %cmp_res = fcmp olt <8 x double> %i, %j
+  %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j ; fcmp olt + select is the min idiom that is expected to match vminpd
+  %r = select <8 x i1> %mask, <8 x double> %min, <8 x double> %dst ; inactive lanes keep %dst, matching the {%k1} merge-masked form
+  ret <8 x double> %r
+}
+
+define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vmaxps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vmaxps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %cmp_res = fcmp ogt <16 x float> %i, %j
+  %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j
+  %r = select <16 x i1> %mask, <16 x float> %max, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ; merge-masked double max; the signature continues after the autogenerated check lines
+; CHECK-LABEL: test_mask_vmaxpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; CHECK-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mask_vmaxpd:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; SKX-NEXT:    vpcmpneqd %ymm4, %ymm3, %k1
+; SKX-NEXT:    vmaxpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+                                     <8 x double> %j, <8 x i32> %mask1)
+                                     nounwind readnone { ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer ; the mask source is <8 x i32>, so the expected compare is a 256-bit (ymm) vpcmpneqd
+  %cmp_res = fcmp ogt <8 x double> %i, %j
+  %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j ; fcmp ogt + select is the max idiom that is expected to match vmaxpd
+  %r = select <8 x i1> %mask, <8 x double> %max, <8 x double> %dst ; inactive lanes keep %dst, matching the {%k1} merge-masked form
+  ret <8 x double> %r
+}
+
+define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vsubps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vsubps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fsub <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i,
+; CHECK-LABEL: test_mask_vdivps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vdivps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <16 x float> %j, <16 x i32> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %x = fdiv <16 x float> %i, %j
+  %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst
+  ret <16 x float> %r
+}
+
+define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i,
+; CHECK-LABEL: test_mask_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm4, %zmm3, %k1
+; CHECK-NEXT:    vaddpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <8 x double> %j, <8 x i64> %mask1)
+                                     nounwind readnone {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %x = fadd <8 x double> %i, %j
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j,
+; CHECK-LABEL: test_maskz_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                      <8 x i64> %mask1) nounwind readnone {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %x = fadd <8 x double> %i, %j
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i,
+; CHECK-LABEL: test_mask_fold_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vaddpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                     <8 x double>* %j,  <8 x i64> %mask1)
+                                     nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load <8 x double>, <8 x double>* %j, align 8
+  %x = fadd <8 x double> %i, %tmp
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst
+  ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j,
+; CHECK-LABEL: test_maskz_fold_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                      <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load <8 x double>, <8 x double>* %j, align 8
+  %x = fadd <8 x double> %i, %tmp
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind {
+; CHECK-LABEL: test_broadcast_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tmp = load double, double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef,
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  ret <8 x double> %x
+}
+
+define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, ; merge-masked add of a broadcast scalar load; the signature continues after the autogenerated check lines
+; CHECK-LABEL: test_mask_broadcast_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                      double* %j, <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load double, double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, ; all-zero shuffle mask (next line) splats the loaded scalar
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %i ; NOTE(review): passthrough is %i and %dst is never used in this body (the checked output zeroes %zmm0 and copies the result from %zmm1); sibling masked tests pass %dst here - confirm this is intentional
+  ret <8 x double> %r
+}
+
+define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j,
+; CHECK-LABEL: test_maskz_broadcast_vaddpd:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+                                       <8 x i64> %mask1) nounwind {
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %tmp = load double, double* %j
+  %b = insertelement <8 x double> undef, double %tmp, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef,
+                     <8 x i32> zeroinitializer
+  %x = fadd <8 x double> %c, %i
+  %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+define <16 x float>  @test_fxor(<16 x float> %a) { ; negation written as (-0.0 - %a); the checked output is a vxorps with a RIP-relative {1to16} broadcast operand (presumably the sign-bit mask)
+; CHECK-LABEL: test_fxor:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_fxor:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; SKX-NEXT:    retq
+
+  %res = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+  ret <16 x float>%res ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+}
+
+define <8 x float>  @test_fxor_8f32(<8 x float> %a) { ; 256-bit variant of the negation test: (-0.0 - %a) is expected as vxorps with a RIP-relative {1to8} broadcast operand on ymm
+; CHECK-LABEL: test_fxor_8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_fxor_8f32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0
+; SKX-NEXT:    retq
+  %res = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+  ret <8 x float>%res ; NOTE(review): the SKX-prefixed lines above are never evaluated by the RUN line visible at the top of this file, which passes only --check-prefix=CHECK
+}
+
+; llvm.fabs on <8 x double>. Expected to select a single vandpd with a
+; RIP-relative {1to8} broadcast memory operand.
+define <8 x double> @fabs_v8f64(<8 x double> %p)
+; CHECK-LABEL: fabs_v8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+{
+  %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
+  ret <8 x double> %t
+}
+declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p)
+
+; llvm.fabs on <16 x float>. Expected to select a single vandps with a
+; RIP-relative {1to16} broadcast memory operand.
+define <16 x float> @fabs_v16f32(<16 x float> %p)
+; CHECK-LABEL: fabs_v16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+{
+  %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
+  ret <16 x float> %t
+}
+declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p)
+
+; Branches on an "une" compare of two doubles: %l1 (a - b) when the operands
+; are unordered or not equal, %l2 (a + b) otherwise. The check lines expect a
+; vucomisd followed by a jne/jnp pair.
+define double @test1(double %a, double %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; CHECK-NEXT:    jne .LBB64_1 # sched: [1:1.00]
+; CHECK-NEXT:    jnp .LBB64_2 # sched: [1:1.00]
+; CHECK-NEXT:  .LBB64_1: # %l1
+; CHECK-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB64_2: # %l2
+; CHECK-NEXT:    vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tobool = fcmp une double %a, %b
+  br i1 %tobool, label %l1, label %l2
+
+l1:
+  %c = fsub double %a, %b
+  ret double %c
+l2:
+  %c1 = fadd double %a, %b
+  ret double %c1
+}
+
+; Branches on an "olt" compare of two floats: %l1 (a - b) when a < b and both
+; are ordered, %l2 (a + b) otherwise. The check lines expect vucomiss with the
+; operands swapped and a single jbe to %l2.
+define float @test2(float %a, float %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vucomiss %xmm0, %xmm1 # sched: [2:1.00]
+; CHECK-NEXT:    jbe .LBB65_2 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#1: # %l1
+; CHECK-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB65_2: # %l2
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tobool = fcmp olt float %a, %b
+  br i1 %tobool, label %l1, label %l2
+
+l1:
+  %c = fsub float %a, %b
+  ret float %c
+l2:
+  %c1 = fadd float %a, %b
+  ret float %c1
+}
+
+; Zero-extends the result of an "oeq" float compare to i32. The check lines
+; expect the compare to go through a mask register (vcmpeqss into %k0), then
+; kmovd to a GPR and a movzbl.
+define i32 @test3(float %a, float %b) {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcmpeqss %xmm1, %xmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movzbl %al, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp10.i = fcmp oeq float %a, %b
+  %conv11.i = zext i1 %cmp10.i to i32
+  ret i32 %conv11.i
+}
+
+; Returns %p unchanged when it compares "oeq" to 0.0; otherwise returns 1.0 if
+; %p > 0.0 (ordered) and -1.0 if not. Attribute group #0 is not defined within
+; this function; its definition lives elsewhere in the file.
+define float @test5(float %p) #0 {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vucomiss %xmm1, %xmm0 # sched: [2:1.00]
+; CHECK-NEXT:    jne .LBB67_1 # sched: [1:1.00]
+; CHECK-NEXT:    jp .LBB67_1 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#2: # %return
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB67_1: # %if.end
+; CHECK-NEXT:    seta %al # sched: [2:1.00]
+; CHECK-NEXT:    movzbl %al, %eax # sched: [1:0.25]
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %cmp = fcmp oeq float %p, 0.000000e+00
+  br i1 %cmp, label %return, label %if.end
+
+if.end:                                           ; preds = %entry
+  %cmp1 = fcmp ogt float %p, 0.000000e+00
+  %cond = select i1 %cmp1, float 1.000000e+00, float -1.000000e+00
+  br label %return
+
+return:                                           ; preds = %if.end, %entry
+  %retval.0 = phi float [ %cond, %if.end ], [ %p, %entry ]
+  ret float %retval.0
+}
+
+; Zero-extends an i32 equality compare to i32. Expected as the scalar
+; xorl / cmpl / sete sequence, with no mask-register traffic.
+define i32 @test6(i32 %a, i32 %b) {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %eax, %eax # sched: [1:0.25]
+; CHECK-NEXT:    cmpl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    sete %al # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp = icmp eq i32 %a, %b
+  %res = zext i1 %cmp to i32
+  ret i32 %res
+}
+
+; Zero-extends an "one" (ordered, not equal) double compare to i32. Expected as
+; xorl / vucomisd / setne. Attribute group #2 is not defined within this
+; function; its definition lives elsewhere in the file.
+define i32 @test7(double %x, double %y) #2 {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax # sched: [1:0.25]
+; CHECK-NEXT:    vucomisd %xmm1, %xmm0 # sched: [2:1.00]
+; CHECK-NEXT:    setne %al # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %0 = fcmp one double %x, %y
+  %or = zext i1 %0 to i32
+  ret i32 %or
+}
+
+; Returns 1 when (%a1 == -1 and %a2 == -2147483648) or %a3 == 0; otherwise
+; returns %a3. The check lines expect a branch-free sequence built from
+; notl/xorl/orl and two conditional moves.
+define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    notl %edi # sched: [1:0.25]
+; CHECK-NEXT:    xorl $-2147483648, %esi # imm = 0x80000000
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    testl %edx, %edx # sched: [1:0.25]
+; CHECK-NEXT:    movl $1, %eax # sched: [1:0.25]
+; CHECK-NEXT:    cmovel %eax, %edx # sched: [1:1.00]
+; CHECK-NEXT:    orl %edi, %esi # sched: [1:0.25]
+; CHECK-NEXT:    cmovnel %edx, %eax # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %tmp1 = icmp eq i32 %a1, -1
+  %tmp2 = icmp eq i32 %a2, -2147483648
+  %tmp3 = and i1 %tmp1, %tmp2
+  %tmp4 = icmp eq i32 %a3, 0
+  %tmp5 = or i1 %tmp3, %tmp4
+  %res = select i1 %tmp5, i32 1, i32 %a3
+  ret i32 %res
+}
+
+; Branches on bit 0 of %a: returns 6 (block A) when the bit is clear and 7
+; (block B) when it is set. Expected as a testb $1 on the low byte plus jne.
+define i32 @test9(i64 %a) {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil # sched: [1:0.25]
+; CHECK-NEXT:    jne .LBB71_2 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#1: # %A
+; CHECK-NEXT:    movl $6, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB71_2: # %B
+; CHECK-NEXT:    movl $7, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+ %b = and i64 %a, 1
+ %cmp10.i = icmp eq i64 %b, 0
+ br i1 %cmp10.i, label %A, label %B
+A:
+ ret i32 6
+B:
+ ret i32 7
+}
+
+; Computes %d xor (%d or (%b == %c)) on i1 values and branches on it. The
+; result is true only when %d is false and %b == %c, in which case 6 is
+; returned (%if.end.i); every other input returns 5 (%if.then.i).
+define i32 @test10(i64 %b, i64 %c, i1 %d) {
+; CHECK-LABEL: test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edx, %eax # sched: [1:0.25]
+; CHECK-NEXT:    andb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    cmpq %rsi, %rdi # sched: [1:0.25]
+; CHECK-NEXT:    sete %cl # sched: [1:1.00]
+; CHECK-NEXT:    orb %dl, %cl # sched: [1:0.25]
+; CHECK-NEXT:    andb $1, %cl # sched: [1:0.25]
+; CHECK-NEXT:    cmpb %cl, %al # sched: [1:0.25]
+; CHECK-NEXT:    je .LBB72_1 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#2: # %if.end.i
+; CHECK-NEXT:    movl $6, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB72_1: # %if.then.i
+; CHECK-NEXT:    movl $5, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+
+  %cmp8.i = icmp eq i64 %b, %c
+  %or1 = or i1 %d, %cmp8.i
+  %xor1 = xor i1 %d, %or1
+  br i1 %xor1, label %if.end.i, label %if.then.i
+
+if.then.i:
+ ret i32 5
+
+if.end.i:
+  ret i32 6
+}
+
+; sitofp <16 x i32> -> <16 x float>; expected as one vcvtdq2ps on zmm.
+define <16 x float> @sitof32(<16 x i32> %a) nounwind {
+; CHECK-LABEL: sitof32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; sitofp <8 x i64> -> <8 x double>; expected as one vcvtqq2pd on zmm.
+define <8 x double> @sltof864(<8 x i64> %a) {
+; CHECK-LABEL: sltof864:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; sitofp <4 x i64> -> <4 x double>; expected as one vcvtqq2pd on ymm.
+define <4 x double> @slto4f64(<4 x i64> %a) {
+; CHECK-LABEL: slto4f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2pd %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <4 x i64> %a to <4 x double>
+  ret <4 x double> %b
+}
+
+; sitofp <2 x i64> -> <2 x double>; expected as one vcvtqq2pd on xmm.
+define <2 x double> @slto2f64(<2 x i64> %a) {
+; CHECK-LABEL: slto2f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2pd %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <2 x i64> %a to <2 x double>
+  ret <2 x double> %b
+}
+
+; sitofp <2 x i64> -> <2 x float>; expected as one vcvtqq2ps on xmm.
+define <2 x float> @sltof2f32(<2 x i64> %a) {
+; CHECK-LABEL: sltof2f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <2 x i64> %a to <2 x float>
+  ret <2 x float>%b
+}
+
+; Loads <4 x i64> from %a (align 8) and converts it with sitofp to <4 x float>.
+; The check lines expect the load to fold into vcvtqq2psy as a memory operand.
+define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
+; CHECK-LABEL: slto4f32_mem:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2psy (%rdi), %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a1 = load <4 x i64>, <4 x i64>* %a, align 8
+  %b = sitofp <4 x i64> %a1 to <4 x float>
+  ret <4 x float>%b
+}
+
+; fptosi <4 x double> -> <4 x i64>; expected as one vcvttpd2qq on ymm.
+define <4 x i64> @f64to4sl(<4 x double> %a) {
+; CHECK-LABEL: f64to4sl:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2qq %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi <4 x double> %a to <4 x i64>
+  ret <4 x i64> %b
+}
+
+; fptosi <4 x float> -> <4 x i64>; expected as one widening vcvttps2qq from
+; xmm to ymm.
+define <4 x i64> @f32to4sl(<4 x float> %a) {
+; CHECK-LABEL: f32to4sl:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2qq %xmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi <4 x float> %a to <4 x i64>
+  ret <4 x i64> %b
+}
+
+; sitofp <4 x i64> -> <4 x float>; expected as one narrowing vcvtqq2ps from
+; ymm to xmm, followed by vzeroupper before the return.
+define <4 x float> @slto4f32(<4 x i64> %a) {
+; CHECK-LABEL: slto4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+; uitofp <4 x i64> -> <4 x float>; expected as one narrowing vcvtuqq2ps from
+; ymm to xmm, followed by vzeroupper before the return.
+define <4 x float> @ulto4f32(<4 x i64> %a) {
+; CHECK-LABEL: ulto4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <4 x i64> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+; uitofp <8 x i64> -> <8 x double>; expected as one vcvtuqq2pd on zmm.
+define <8 x double> @ulto8f64(<8 x i64> %a) {
+; CHECK-LABEL: ulto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; uitofp <16 x i64> -> <16 x double>. The 16-lane operation is expected to be
+; split into two vcvtuqq2pd instructions, one per zmm register.
+define <16 x double> @ulto16f64(<16 x i64> %a) {
+; CHECK-LABEL: ulto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtuqq2pd %zmm0, %zmm0
+; CHECK-NEXT:    vcvtuqq2pd %zmm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <16 x i64> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+; fptosi <16 x float> -> <16 x i32>; expected as one vcvttps2dq on zmm.
+; NOTE(review): despite the "f64" in the name, the operand type is
+; <16 x float>, so this exercises the f32 conversion.
+define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
+; CHECK-LABEL: f64to16si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2dq %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+; fptoui <16 x float> -> <16 x i32>; expected as one vcvttps2udq on zmm.
+define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
+; CHECK-LABEL: f32to16ui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui <16 x float> %a to <16 x i32>
+  ret <16 x i32> %b
+}
+
+; fptoui <16 x float> -> <16 x i8>. Expected as vcvttps2udq to 32-bit lanes
+; followed by a vpmovdb truncation to xmm, then vzeroupper.
+define <16 x i8> @f32to16uc(<16 x float> %f) {
+; CHECK-LABEL: f32to16uc:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm0
+; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = fptoui <16 x float> %f to <16 x i8>
+  ret <16 x i8> %res
+}
+
+; fptoui <16 x float> -> <16 x i16>. Expected as vcvttps2udq to 32-bit lanes
+; followed by a vpmovdw truncation to ymm.
+define <16 x i16> @f32to16us(<16 x float> %f) {
+; CHECK-LABEL: f32to16us:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2udq %zmm0, %zmm0
+; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = fptoui <16 x float> %f to <16 x i16>
+  ret <16 x i16> %res
+}
+
+; fptoui <8 x float> -> <8 x i32>; expected as one vcvttps2udq on ymm.
+define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
+; CHECK-LABEL: f32to8ui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2udq %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui <8 x float> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+; fptoui <4 x float> -> <4 x i32>; expected as one vcvttps2udq on xmm.
+define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
+; CHECK-LABEL: f32to4ui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttps2udq %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui <4 x float> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+; fptoui <8 x double> -> <8 x i32>; expected as one narrowing vcvttpd2udq from
+; zmm to ymm.
+define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
+; CHECK-LABEL: f64to8ui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+; fptoui <8 x double> -> <8 x i16>. The check lines expect vcvttpd2dq to
+; 32-bit lanes followed by a vpmovdw truncation to xmm, then vzeroupper.
+define <8 x i16> @f64to8us(<8 x double> %f) {
+; CHECK-LABEL: f64to8us:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = fptoui <8 x double> %f to <8 x i16>
+  ret <8 x i16> %res
+}
+
+; fptoui <8 x double> -> <8 x i8>. The check lines expect the same
+; vcvttpd2dq + vpmovdw + vzeroupper sequence as the <8 x i16> variant
+; (f64to8us); no separate byte truncation appears in the expected output.
+define <8 x i8> @f64to8uc(<8 x double> %f) {
+; CHECK-LABEL: f64to8uc:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; CHECK-NEXT:    vpmovdw %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = fptoui <8 x double> %f to <8 x i8>
+  ret <8 x i8> %res
+}
+
+; fptoui <4 x double> -> <4 x i32>; expected as one narrowing vcvttpd2udq from
+; ymm to xmm, followed by vzeroupper.
+define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
+; CHECK-LABEL: f64to4ui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui <4 x double> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+; sitofp <8 x i32> -> <8 x double>; expected as one widening vcvtdq2pd from
+; ymm to zmm.
+define <8 x double> @sito8f64(<8 x i32> %a) {
+; CHECK-LABEL: sito8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <8 x i32> %a to <8 x double>
+  ret <8 x double> %b
+}
+; Merge-masked sitofp <8 x i32> -> <8 x double>: the i8 %c is bitcast to an
+; <8 x i1> mask, selected lanes take the converted %b and the rest keep %a.
+; Expected as kmovd into %k1 followed by a {%k1}-masked vcvtdq2pd.
+define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; CHECK-LABEL: i32to8f64_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = bitcast i8 %c to <8 x i1>
+  %2 = sitofp <8 x i32> %b to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+  ret <8 x double> %3
+}
+; Zero-masked sitofp <8 x i32> -> <8 x double>: the i8 %b is bitcast to an
+; <8 x i1> mask and unselected lanes become 0.0. Expected as kmovd into %k1
+; followed by a {%k1} {z} vcvtdq2pd.
+define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; CHECK-LABEL: sito8f64_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = bitcast i8 %b to <8 x i1>
+  %2 = sitofp <8 x i32> %a to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+
+; fptosi <8 x double> -> <8 x i32>; expected as one narrowing vcvttpd2dq from
+; zmm to ymm.
+define <8 x i32> @f64to8si(<8 x double> %a) {
+; CHECK-LABEL: f64to8si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi <8 x double> %a to <8 x i32>
+  ret <8 x i32> %b
+}
+
+; fptosi <4 x double> -> <4 x i32>; expected as one narrowing vcvttpd2dq from
+; ymm to xmm, followed by vzeroupper.
+define <4 x i32> @f64to4si(<4 x double> %a) {
+; CHECK-LABEL: f64to4si:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi <4 x double> %a to <4 x i32>
+  ret <4 x i32> %b
+}
+
+; fptrunc <16 x double> -> <16 x float>. Expected as two vcvtpd2ps (one per
+; input zmm) whose ymm results are joined with vinsertf64x4.
+define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
+; CHECK-LABEL: f64to16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm0
+; CHECK-NEXT:    vcvtpd2ps %zmm1, %ymm1
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = fptrunc <16 x double> %b to <16 x float>
+  ret <16 x float> %a
+}
+
+; fptrunc <4 x double> -> <4 x float>; expected as one vcvtpd2ps from ymm to
+; xmm, followed by vzeroupper.
+define <4 x float> @f64to4f32(<4 x double> %b) {
+; CHECK-LABEL: f64to4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = fptrunc <4 x double> %b to <4 x float>
+  ret <4 x float> %a
+}
+
+; Zero-masked fptrunc <4 x double> -> <4 x float> with an <4 x i1> argument as
+; the mask. The check lines expect the mask to be materialised in %k1 with
+; vpslld $31 + vptestmd, then used by a {%k1} {z} vcvtpd2ps.
+define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
+; CHECK-LABEL: f64to4f32_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = fptrunc <4 x double> %b to <4 x float>
+  %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
+  ret <4 x float> %c
+}
+
+; Truncates element 0 of %a0 from double to float and inserts it into element
+; 0 of %a1. Expected as a single register-to-register vcvtsd2ss.
+define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: f64tof32_inreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ext = extractelement <2 x double> %a0, i32 0
+  %cvt = fptrunc double %ext to float
+  %res = insertelement <4 x float> %a1, float %cvt, i32 0
+  ret <4 x float> %res
+}
+
+; fpext <8 x float> -> <8 x double>; expected as one widening vcvtps2pd from
+; ymm to zmm.
+define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
+; CHECK-LABEL: f32to8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = fpext <8 x float> %b to <8 x double>
+  ret <8 x double> %a
+}
+
+; Zero-masked fpext <4 x float> -> <4 x double>, where the mask is
+; %a1 > %b1 (ogt) per lane. The check lines expect the compare as a vcmpltpd
+; with swapped operands writing %k1, then a {%k1} {z} vcvtps2pd.
+define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
+; CHECK-LABEL: f32to4f64_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcmpltpd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = fpext <4 x float> %b to <4 x double>
+  %mask = fcmp ogt <4 x double> %a1, %b1
+  %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
+  ret <4 x double> %c
+}
+
+; Extends element 0 of %a1 from float to double and inserts it into element 0
+; of %a0. Expected as a single register-to-register vcvtss2sd.
+define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
+; CHECK-LABEL: f32tof64_inreg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ext = extractelement <4 x float> %a1, i32 0
+  %cvt = fpext float %ext to double
+  %res = insertelement <2 x double> %a0, double %cvt, i32 0
+  ret <2 x double> %res
+}
+
+; Loads an i64 from %e and converts it to double with sitofp. The load is
+; expected to fold into vcvtsi2sdq as a memory operand.
+define double @sltof64_load(i64* nocapture %e) {
+; CHECK-LABEL: sltof64_load:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp1 = load i64, i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to double
+  ret double %conv
+}
+
+; Loads an i32 from %e and converts it to double with sitofp. The load is
+; expected to fold into vcvtsi2sdl as a memory operand.
+define double @sitof64_load(i32* %e) {
+; CHECK-LABEL: sitof64_load:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp1 = load i32, i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to double
+  ret double %conv
+}
+
+; Loads an i32 from %e and converts it to float with sitofp. The load is
+; expected to fold into vcvtsi2ssl as a memory operand.
+define float @sitof32_load(i32* %e) {
+; CHECK-LABEL: sitof32_load:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp1 = load i32, i32* %e, align 4
+  %conv = sitofp i32 %tmp1 to float
+  ret float %conv
+}
+
+; Loads an i64 from %e and converts it to float with sitofp. The load is
+; expected to fold into vcvtsi2ssq as a memory operand.
+define float @sltof32_load(i64* %e) {
+; CHECK-LABEL: sltof32_load:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %tmp1 = load i64, i64* %e, align 8
+  %conv = sitofp i64 %tmp1 to float
+  ret float %conv
+}
+
+; Loads a float from the stack slot %f, extends it to double and stores it to
+; the stack slot %d. %f is read without ever being stored to, so only the
+; selected instruction sequence (vmovss / vcvtss2sd / vmovsd) is meaningful.
+define void @f32tof64_loadstore() {
+; CHECK-LABEL: f32tof64_loadstore:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; CHECK-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load float, float* %f, align 4
+  %conv = fpext float %tmp to double
+  store double %conv, double* %d, align 8
+  ret void
+}
+
+; Loads a double from the stack slot %d, truncates it to float and stores it
+; to the stack slot %f. %d is read without ever being stored to, so only the
+; selected instruction sequence (vmovsd / vcvtsd2ss / vmovss) is meaningful.
+define void @f64tof32_loadstore() nounwind uwtable {
+; CHECK-LABEL: f64tof32_loadstore:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50]
+; CHECK-NEXT:    vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00]
+; CHECK-NEXT:    vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %f = alloca float, align 4
+  %d = alloca double, align 8
+  %tmp = load double, double* %d, align 8
+  %conv = fptrunc double %tmp to float
+  store float %conv, float* %f, align 4
+  ret void
+}
+
+; Bitcast i64 -> double; expected as a GPR-to-xmm vmovq with no conversion.
+define double @long_to_double(i64 %x) {
+; CHECK-LABEL: long_to_double:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovq %rdi, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = bitcast i64 %x to double
+   ret double %res
+}
+
+; Bitcast double -> i64; expected as an xmm-to-GPR vmovq with no conversion.
+define i64 @double_to_long(double %x) {
+; CHECK-LABEL: double_to_long:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovq %xmm0, %rax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = bitcast double %x to i64
+   ret i64 %res
+}
+
+; Bitcast i32 -> float; expected as a GPR-to-xmm vmovd with no conversion.
+define float @int_to_float(i32 %x) {
+; CHECK-LABEL: int_to_float:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %edi, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = bitcast i32 %x to float
+   ret float %res
+}
+
+; Bitcast float -> i32; expected as an xmm-to-GPR vmovd with no conversion.
+define i32 @float_to_int(float %x) {
+; CHECK-LABEL: float_to_int:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %xmm0, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = bitcast float %x to i32
+   ret i32 %res
+}
+
+; uitofp <16 x i32> -> <16 x double>. Expected to be split in two: the low ymm
+; half is converted directly, the high half is extracted with vextractf64x4
+; and converted separately, and a vmovaps places the low result in zmm0.
+define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
+; CHECK-LABEL: uito16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <16 x i32> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+; sitofp <8 x i64> -> <8 x float>; expected as one narrowing vcvtqq2ps from
+; zmm to ymm.
+define <8 x float> @slto8f32(<8 x i64> %a) {
+; CHECK-LABEL: slto8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <8 x i64> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+; sitofp <16 x i64> -> <16 x float>. Expected as two vcvtqq2ps (one per input
+; zmm) whose ymm results are joined with vinsertf64x4.
+define <16 x float> @slto16f32(<16 x i64> %a) {
+; CHECK-LABEL: slto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2ps %zmm0, %ymm0
+; CHECK-NEXT:    vcvtqq2ps %zmm1, %ymm1
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <16 x i64> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; sitofp <8 x i64> -> <8 x double>; expected as one vcvtqq2pd on zmm.
+define <8 x double> @slto8f64(<8 x i64> %a) {
+; CHECK-LABEL: slto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <8 x i64> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; sitofp <16 x i64> -> <16 x double>. The 16-lane operation is expected to be
+; split into two vcvtqq2pd instructions, one per zmm register.
+define <16 x double> @slto16f64(<16 x i64> %a) {
+; CHECK-LABEL: slto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtqq2pd %zmm0, %zmm0
+; CHECK-NEXT:    vcvtqq2pd %zmm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <16 x i64> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+; uitofp <8 x i64> -> <8 x float>; expected as one narrowing vcvtuqq2ps from
+; zmm to ymm.
+define <8 x float> @ulto8f32(<8 x i64> %a) {
+; CHECK-LABEL: ulto8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <8 x i64> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+; uitofp <16 x i64> -> <16 x float>. Expected as two vcvtuqq2ps (one per input
+; zmm) whose ymm results are joined with vinsertf64x4.
+define <16 x float> @ulto16f32(<16 x i64> %a) {
+; CHECK-LABEL: ulto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtuqq2ps %zmm0, %ymm0
+; CHECK-NEXT:    vcvtuqq2ps %zmm1, %ymm1
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <16 x i64> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; Merge-masked uitofp <8 x i32> -> <8 x double>: the i8 %c is bitcast to an
+; <8 x i1> mask, selected lanes take the converted %b and the rest keep %a.
+; Expected as kmovd into %k1 followed by a {%k1}-masked vcvtudq2pd.
+define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
+; CHECK-LABEL: uito8f64_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = bitcast i8 %c to <8 x i1>
+  %2 = uitofp <8 x i32> %b to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
+  ret <8 x double> %3
+}
+; Zero-masked uitofp <8 x i32> -> <8 x double>: the i8 %b is bitcast to an
+; <8 x i1> mask and unselected lanes become 0.0. Expected as kmovd into %k1
+; followed by a {%k1} {z} vcvtudq2pd.
+define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
+; CHECK-LABEL: uito8f64_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = bitcast i8 %b to <8 x i1>
+  %2 = uitofp <8 x i32> %a to <8 x double>
+  %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %3
+}
+
+; uitofp <4 x i32> -> <4 x double>; expected as one widening vcvtudq2pd from
+; xmm to ymm.
+define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
+; CHECK-LABEL: uito4f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <4 x i32> %a to <4 x double>
+  ret <4 x double> %b
+}
+
+; uitofp <16 x i32> -> <16 x float>; expected as one vcvtudq2ps on zmm.
+define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
+; CHECK-LABEL: uito16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %b
+}
+
+; uitofp <8 x i32> -> <8 x double>; expected as one widening vcvtudq2pd from
+; ymm to zmm.
+define <8 x double> @uito8f64(<8 x i32> %a) {
+; CHECK-LABEL: uito8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <8 x i32> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; uitofp <8 x i32> -> <8 x float>; expected as one vcvtudq2ps on ymm.
+define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
+; CHECK-LABEL: uito8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <8 x i32> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+; uitofp <4 x i32> -> <4 x float>; expected as one vcvtudq2ps on xmm.
+define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: uito4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp <4 x i32> %a to <4 x float>
+  ret <4 x float> %b
+}
+
+; Scalar fptosi float -> i32; expected as one vcvttss2si into %eax.
+define i32 @fptosi(float %a) nounwind {
+; CHECK-LABEL: fptosi:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttss2si %xmm0, %eax # sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptosi float %a to i32
+  ret i32 %b
+}
+
+; Scalar fptoui float -> i32; expected as one vcvttss2usi into %eax.
+define i32 @fptoui(float %a) nounwind {
+; CHECK-LABEL: fptoui:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvttss2usi %xmm0, %eax
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptoui float %a to i32
+  ret i32 %b
+}
+
+; Scalar uitofp i32 -> float; expected as one vcvtusi2ssl from %edi.
+define float @uitof32(i32 %a) nounwind {
+; CHECK-LABEL: uitof32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp i32 %a to float
+  ret float %b
+}
+
+; Scalar uitofp i32 -> double; expected as one vcvtusi2sdl from %edi.
+define double @uitof64(i32 %a) nounwind {
+; CHECK-LABEL: uitof64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = uitofp i32 %a to double
+  ret double %b
+}
+
+; Builds an <16 x i1> mask from %a < 0 (signed) and converts it to float with
+; sitofp. Expected as vpcmpgtd against zero into %k0, vpmovm2d to expand the
+; mask into a vector, then vcvtdq2ps.
+define <16 x float> @sbto16f32(<16 x i32> %a) {
+; CHECK-LABEL: sbto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %zmm0
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp slt <16 x i32> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %mask to <16 x float>
+  ret <16 x float> %1
+}
+
+; sitofp <16 x i8> -> <16 x float>. Expected as a vpmovsxbd sign extension to
+; 32-bit lanes followed by vcvtdq2ps.
+define <16 x float> @scto16f32(<16 x i8> %a) {
+; CHECK-LABEL: scto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = sitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; sitofp <16 x i16> -> <16 x float>. Expected as a vpmovsxwd sign extension to
+; 32-bit lanes followed by vcvtdq2ps.
+define <16 x float> @ssto16f32(<16 x i16> %a) {
+; CHECK-LABEL: ssto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %1
+}
+
+; sitofp <8 x i16> -> <8 x double>. Expected as a vpmovsxwd sign extension to
+; 32-bit lanes followed by a widening vcvtdq2pd.
+; NOTE(review): despite the "16" in the name, both vector types have 8 lanes.
+define <8 x double> @ssto16f64(<8 x i16> %a) {
+; CHECK-LABEL: ssto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; sitofp <8 x i8> -> <8 x double>. The check lines expect the input to be
+; widened from 16-bit to 32-bit lanes with vpmovzxwd, sign-extended from the
+; low byte with a shift-left / arithmetic-shift-right by 24, then converted
+; with vcvtdq2pd.
+define <8 x double> @scto8f64(<8 x i8> %a) {
+; CHECK-LABEL: scto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; CHECK-NEXT:    vpslld $24, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpsrad $24, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = sitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %1
+}
+
+; sitofp <16 x i8> -> <16 x double>. Expected as one vpmovsxbd sign extension
+; to 32-bit lanes, then two vcvtdq2pd conversions: the low ymm half directly
+; and the high half after a vextracti64x4.
+define <16 x double> @scto16f64(<16 x i8> %a) {
+; CHECK-LABEL: scto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = sitofp <16 x i8> %a to <16 x double>
+  ret <16 x double> %b
+}
+
+; Compares each lane of %a against 0.0 (ogt) and converts the resulting
+; <16 x i1> to double with sitofp. Expected as two vcmpltpd compares (zero on
+; the left) into mask registers, each expanded with vpmovm2d and converted
+; with vcvtdq2pd.
+define <16 x double> @sbto16f64(<16 x double> %a) {
+; CHECK-LABEL: sbto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltpd %zmm1, %zmm2, %k0
+; CHECK-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vpmovm2d %k1, %ymm0
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    vpmovm2d %k0, %ymm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
+  %1 = sitofp <16 x i1> %cmpres to <16 x double>
+  ret <16 x double> %1
+}
+
+; Compares <8 x double> against zero (fcmp ogt) and converts the resulting
+; <8 x i1> to <8 x double> with sitofp.
+define <8 x double> @sbto8f64(<8 x double> %a) {
+; CHECK-LABEL: sbto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %ymm0
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x double>
+  ret <8 x double> %1
+}
+
+; Compares <8 x float> against zero (fcmp ogt) and converts the resulting
+; <8 x i1> to <8 x float> with sitofp.
+define <8 x float> @sbto8f32(<8 x float> %a) {
+; CHECK-LABEL: sbto8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltps %ymm0, %ymm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %ymm0
+; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
+  %1 = sitofp <8 x i1> %cmpres to <8 x float>
+  ret <8 x float> %1
+}
+
+; Compares <4 x float> against zero (fcmp ogt) and converts the resulting
+; <4 x i1> to <4 x float> with sitofp.
+define <4 x float> @sbto4f32(<4 x float> %a) {
+; CHECK-LABEL: sbto4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltps %xmm0, %xmm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x float>
+  ret <4 x float> %1
+}
+
+; Compares <4 x double> against zero (fcmp ogt) and converts the resulting
+; <4 x i1> to <4 x double> with sitofp.
+define <4 x double> @sbto4f64(<4 x double> %a) {
+; CHECK-LABEL: sbto4f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltpd %ymm0, %ymm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
+  %1 = sitofp <4 x i1> %cmpres to <4 x double>
+  ret <4 x double> %1
+}
+
+; Compares <2 x float> against zero (fcmp ogt) and converts the resulting
+; <2 x i1> to <2 x float> with sitofp. The expected asm operates on a full xmm.
+define <2 x float> @sbto2f32(<2 x float> %a) {
+; CHECK-LABEL: sbto2f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltps %xmm0, %xmm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x float>
+  ret <2 x float> %1
+}
+
+; Compares <2 x double> against zero (fcmp ogt) and converts the resulting
+; <2 x i1> to <2 x double> with sitofp. The expected asm expands the mask to
+; 64-bit lanes (vpmovm2q) and converts with vcvtqq2pd.
+define <2 x double> @sbto2f64(<2 x double> %a) {
+; CHECK-LABEL: sbto2f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpltpd %xmm0, %xmm1, %k0
+; CHECK-NEXT:    vpmovm2q %k0, %xmm0
+; CHECK-NEXT:    vcvtqq2pd %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
+  %1 = sitofp <2 x i1> %cmpres to <2 x double>
+  ret <2 x double> %1
+}
+
+; Unsigned conversion of <16 x i8> to <16 x float>: zero-extend to 32-bit lanes,
+; then a signed dword conversion (vcvtdq2ps) in the expected asm.
+define <16 x float> @ucto16f32(<16 x i8> %a) {
+; CHECK-LABEL: ucto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <16 x i8> %a to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Unsigned conversion of <8 x i8> to <8 x double>. The expected asm masks the
+; input with a constant-pool vpand before widening and converting.
+define <8 x double> @ucto8f64(<8 x i8> %a) {
+; CHECK-LABEL: ucto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <8 x i8> %a to <8 x double>
+  ret <8 x double> %conv
+}
+
+; Signed conversion of <16 x i16> to <16 x float>: sign-extend to dwords, then
+; vcvtdq2ps.
+define <16 x float> @swto16f32(<16 x i16> %a) {
+; CHECK-LABEL: swto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm0
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = sitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Signed conversion of <8 x i16> to <8 x double>: sign-extend to dwords, then
+; vcvtdq2pd.
+define <8 x double> @swto8f64(<8 x i16> %a) {
+; CHECK-LABEL: swto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = sitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %conv
+}
+
+; Signed conversion of <16 x i16> to <16 x double>. The result needs two zmm
+; registers, so the expected asm converts each 256-bit half on its own.
+define <16 x double> @swto16f64(<16 x i16> %a) {
+; CHECK-LABEL: swto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd %ymm0, %zmm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = sitofp <16 x i16> %a to <16 x double>
+  ret <16 x double> %conv
+}
+
+; Unsigned conversion of <16 x i8> to <16 x double>: zero-extend to dwords, then
+; convert the low and high 256-bit halves separately.
+define <16 x double> @ucto16f64(<16 x i8> %a) {
+; CHECK-LABEL: ucto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <16 x i8> %a to <16 x double>
+  ret <16 x double> %conv
+}
+
+; Unsigned conversion of <16 x i16> to <16 x float>: zero-extend to dwords, then
+; vcvtdq2ps.
+define <16 x float> @uwto16f32(<16 x i16> %a) {
+; CHECK-LABEL: uwto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Unsigned conversion of <8 x i16> to <8 x double>: zero-extend to dwords, then
+; vcvtdq2pd.
+define <8 x double> @uwto8f64(<8 x i16> %a) {
+; CHECK-LABEL: uwto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <8 x i16> %a to <8 x double>
+  ret <8 x double> %conv
+}
+
+; Unsigned conversion of <16 x i16> to <16 x double>: zero-extend to dwords,
+; then convert the low and high 256-bit halves separately.
+define <16 x double> @uwto16f64(<16 x i16> %a) {
+; CHECK-LABEL: uwto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm0
+; CHECK-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; CHECK-NEXT:    vcvtdq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <16 x i16> %a to <16 x double>
+  ret <16 x double> %conv
+}
+
+; Signed conversion of <16 x i32> to <16 x float>; a single vcvtdq2ps on zmm.
+define <16 x float> @sito16f32(<16 x i32> %a) {
+; CHECK-LABEL: sito16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = sitofp <16 x i32> %a to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Signed conversion of <16 x i32> to <16 x double>. The expected asm converts
+; the low half into zmm2, the extracted high half into zmm1, and then copies
+; zmm2 into zmm0.
+define <16 x double> @sito16f64(<16 x i32> %a) {
+; CHECK-LABEL: sito16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm2
+; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT:    vcvtdq2pd %ymm0, %zmm1
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = sitofp <16 x i32> %a to <16 x double>
+  ret <16 x double> %conv
+}
+
+; Unsigned conversion of <16 x i16> to <16 x float>: zero-extend to dwords, then
+; vcvtdq2ps.
+define <16 x float> @usto16f32(<16 x i16> %a) {
+; CHECK-LABEL: usto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT:    vcvtdq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %conv = uitofp <16 x i16> %a to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Builds a <16 x i1> from a signed "less than zero" compare on <16 x i32> and
+; converts it to <16 x float> with uitofp. The expected asm does a zero-masked
+; broadcast of a constant-pool dword followed by vcvtudq2ps.
+define <16 x float> @ubto16f32(<16 x i32> %a) {
+; CHECK-LABEL: ubto16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <16 x i32> %a, zeroinitializer
+  %conv = uitofp <16 x i1> %isneg to <16 x float>
+  ret <16 x float> %conv
+}
+
+; Builds a <16 x i1> from a signed "less than zero" compare on <16 x i32> and
+; converts it to <16 x double> with uitofp. The expected asm handles the two
+; halves separately, shifting the mask right by 8 (kshiftrw) for the upper one.
+define <16 x double> @ubto16f64(<16 x i32> %a) {
+; CHECK-LABEL: ubto16f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k1
+; CHECK-NEXT:    movl {{.*}}(%rip), %eax # sched: [1:0.50]
+; CHECK-NEXT:    vpbroadcastd %eax, %ymm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; CHECK-NEXT:    kshiftrw $8, %k1, %k1
+; CHECK-NEXT:    vpbroadcastd %eax, %ymm1 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2pd %ymm1, %zmm1
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <16 x i32> %a, zeroinitializer
+  %conv = uitofp <16 x i1> %isneg to <16 x double>
+  ret <16 x double> %conv
+}
+
+; Builds an <8 x i1> from a signed "less than zero" compare on <8 x i32> and
+; converts it to <8 x float> with uitofp.
+define <8 x float> @ubto8f32(<8 x i32> %a) {
+; CHECK-LABEL: ubto8f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2ps %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <8 x i32> %a, zeroinitializer
+  %conv = uitofp <8 x i1> %isneg to <8 x float>
+  ret <8 x float> %conv
+}
+
+; Builds an <8 x i1> from a signed "less than zero" compare on <8 x i32> and
+; converts it to <8 x double> with uitofp.
+define <8 x double> @ubto8f64(<8 x i32> %a) {
+; CHECK-LABEL: ubto8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <8 x i32> %a, zeroinitializer
+  %conv = uitofp <8 x i1> %isneg to <8 x double>
+  ret <8 x double> %conv
+}
+
+; Builds a <4 x i1> from a signed "less than zero" compare on <4 x i32> and
+; converts it to <4 x float> with uitofp.
+define <4 x float> @ubto4f32(<4 x i32> %a) {
+; CHECK-LABEL: ubto4f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <4 x i32> %a, zeroinitializer
+  %conv = uitofp <4 x i1> %isneg to <4 x float>
+  ret <4 x float> %conv
+}
+
+; Builds a <4 x i1> from a signed "less than zero" compare on <4 x i32> and
+; converts it to <4 x double> with uitofp.
+define <4 x double> @ubto4f64(<4 x i32> %a) {
+; CHECK-LABEL: ubto4f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpgtd %xmm0, %xmm1, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2pd %xmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %isneg = icmp slt <4 x i32> %a, zeroinitializer
+  %conv = uitofp <4 x i1> %isneg to <4 x double>
+  ret <4 x double> %conv
+}
+
+; Converts the <2 x i1> result of an unsigned compare on <2 x i32> to
+; <2 x float> with uitofp.
+; NOTE(review): "icmp ult %a, zeroinitializer" can never be true (nothing is
+; unsigned-less-than zero), so the mask is constant false; the sibling ubto*
+; tests use slt. Confirm the predicate is intended. Changing it requires
+; regenerating the expected output below.
+define <2 x float> @ubto2f32(<2 x i32> %a) {
+; CHECK-LABEL: ubto2f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
+; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; CHECK-NEXT:    vcvtudq2ps %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp = icmp ult <2 x i32> %a, zeroinitializer
+  %conv = uitofp <2 x i1> %cmp to <2 x float>
+  ret <2 x float> %conv
+}
+
+; Converts the <2 x i1> result of an unsigned compare on <2 x i32> to
+; <2 x double> with uitofp. The expected asm does a zero-masked constant-pool
+; load followed by vcvtuqq2pd.
+; NOTE(review): "icmp ult %a, zeroinitializer" can never be true (nothing is
+; unsigned-less-than zero), so the mask is constant false; the sibling ubto*
+; tests use slt. Confirm the predicate is intended. Changing it requires
+; regenerating the expected output below.
+define <2 x double> @ubto2f64(<2 x i32> %a) {
+; CHECK-LABEL: ubto2f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50]
+; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    vcvtuqq2pd %xmm0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ult <2 x i32> %a, zeroinitializer
+  %1 = uitofp <2 x i1> %mask to <2 x double>
+  ret <2 x double> %1
+}
+
+; Loads <8 x i8> through %i, zero-extends to <8 x i16> and selects zero for
+; lanes where %mask is false. The expected asm folds load, extension and select
+; into one zero-masked vpmovzxbw from memory.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x8mem_to_8x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <8 x i8>,<8 x i8> *%i,align 1
+  %x   = zext <8 x i8> %a to <8 x i16>
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+; Loads <8 x i8> through %i, sign-extends to <8 x i16> and selects zero for
+; lanes where %mask is false. The expected asm folds load, extension and select
+; into one zero-masked vpmovsxbw from (%rdi).
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x8mem_to_8x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbw (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <8 x i8>,<8 x i8> *%i,align 1
+  %x   = sext <8 x i8> %a to <8 x i16>
+  %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer
+  ret <8 x i16> %ret
+}
+
+
+; Loads <16 x i8> through %i, zero-extends to <16 x i16> and selects zero for
+; lanes where %mask is false. The expected asm is one zero-masked vpmovzxbw
+; from memory into ymm0.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x8mem_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <16 x i8>,<16 x i8> *%i,align 1
+  %x   = zext <16 x i8> %a to <16 x i16>
+  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %ret
+}
+
+; Loads <16 x i8> through %i, sign-extends to <16 x i16> and selects zero for
+; lanes where %mask is false. The expected asm is one zero-masked vpmovsxbw
+; from (%rdi) into ymm0.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_16x8mem_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbw (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <16 x i8>,<16 x i8> *%i,align 1
+  %x   = sext <16 x i8> %a to <16 x i16>
+  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %ret
+}
+
+; Register-to-register zero extension of <16 x i8> to <16 x i16>; a single
+; unmasked vpmovzxbw.
+define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; CHECK-LABEL: zext_16x8_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ext = zext <16 x i8> %a to <16 x i16>
+  ret <16 x i16> %ext
+}
+
+; Zero-extends <16 x i8> to <16 x i16> and selects zero for lanes where %mask
+; is false. The expected asm turns %mask into k1 and uses a zero-masked
+; vpmovzxbw.
+define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x8_to_16x16_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = zext <16 x i8> %a to <16 x i16>
+  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %ret
+}
+
+; Register-to-register sign extension of <16 x i8> to <16 x i16>; a single
+; unmasked vpmovsxbw.
+define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone {
+; CHECK-LABEL: sext_16x8_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ext = sext <16 x i8> %a to <16 x i16>
+  ret <16 x i16> %ext
+}
+
+; Sign-extends <16 x i8> to <16 x i16> and selects zero for lanes where %mask
+; is false. The expected asm turns %mask into k1 and uses a zero-masked
+; vpmovsxbw.
+define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_16x8_to_16x16_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = sext <16 x i8> %a to <16 x i16>
+  %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer
+  ret <16 x i16> %ret
+}
+
+; Loads <32 x i8> through %i, zero-extends to <32 x i16> and selects zero for
+; lanes where %mask is false. The expected asm is one zero-masked vpmovzxbw
+; from memory into zmm0.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_32x8mem_to_32x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm0, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <32 x i8>,<32 x i8> *%i,align 1
+  %x   = zext <32 x i8> %a to <32 x i16>
+  %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+; Loads <32 x i8> through %i, sign-extends to <32 x i16> and selects zero for
+; lanes where %mask is false. The expected asm is one zero-masked vpmovsxbw
+; from (%rdi) into zmm0.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_32x8mem_to_32x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm0, %k1
+; CHECK-NEXT:    vpmovsxbw (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <32 x i8>,<32 x i8> *%i,align 1
+  %x   = sext <32 x i8> %a to <32 x i16>
+  %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+; Register-to-register zero extension of <32 x i8> to <32 x i16>; a single
+; unmasked vpmovzxbw into zmm0.
+define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; CHECK-LABEL: zext_32x8_to_32x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = zext <32 x i8> %a to <32 x i16>
+  ret <32 x i16> %x
+}
+
+; Zero-extends <32 x i8> to <32 x i16> and selects zero for lanes where %mask
+; is false. The expected asm turns %mask into k1 and uses a zero-masked
+; vpmovzxbw into zmm0.
+define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_32x8_to_32x16_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm1, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = zext <32 x i8> %a to <32 x i16>
+  %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+; Register-to-register sign extension of <32 x i8> to <32 x i16>; a single
+; unmasked vpmovsxbw into zmm0.
+define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone {
+; CHECK-LABEL: sext_32x8_to_32x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbw %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = sext <32 x i8> %a to <32 x i16>
+  ret <32 x i16> %x
+}
+
+; Sign-extends <32 x i8> to <32 x i16> and selects zero for lanes where %mask
+; is false. The expected asm turns %mask into k1 and uses a zero-masked
+; vpmovsxbw into zmm0.
+define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_32x8_to_32x16_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm1, %k1
+; CHECK-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = sext <32 x i8> %a to <32 x i16>
+  %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+; Loads <4 x i8> through %i, zero-extends to <4 x i32> and selects zero for
+; lanes where %mask is false. The expected asm builds k1 with vptestmd and uses
+; one zero-masked vpmovzxbd from memory.
+; NOTE(review): the function is marked readnone although it loads through %i -
+; confirm the attribute is intended.
+define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x8mem_to_4x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <4 x i8>,<4 x i8> *%i,align 1
+  %x   = zext <4 x i8> %a to <4 x i32>
+  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %ret
+}
+
+define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_4x8mem_to_4x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_4x8mem_to_4x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxbd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <4 x i8>, <4 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i8> %narrow to <4 x i32> ; sign-extend every i8 lane to i32
+  %masked = select <4 x i1> %mask, <4 x i32> %wide, <4 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i32> %masked
+}
+
+define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x8mem_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x8mem_to_8x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <8 x i8>, <8 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <8 x i8> %narrow to <8 x i32> ; zero-extend every i8 lane to i32
+  %masked = select <8 x i1> %mask, <8 x i32> %wide, <8 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i32> %masked
+}
+
+define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x8mem_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_8x8mem_to_8x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxbd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <8 x i8>, <8 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i8> %narrow to <8 x i32> ; sign-extend every i8 lane to i32
+  %masked = select <8 x i1> %mask, <8 x i32> %wide, <8 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i32> %masked
+}
+
+define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x8mem_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_16x8mem_to_16x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <16 x i8>, <16 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <16 x i8> %narrow to <16 x i32> ; zero-extend every i8 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_16x8mem_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_16x8mem_to_16x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxbd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <16 x i8>, <16 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <16 x i8> %narrow to <16 x i32> ; sign-extend every i8 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x8_to_16x32_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_16x8_to_16x32_mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT:    vpmovb2m %xmm1, %k1
+; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; SKX-NEXT:    retq
+  %wide = zext <16 x i8> %a to <16 x i32> ; zero-extend every i8 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_16x8_to_16x32_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_16x8_to_16x32_mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT:    vpmovb2m %xmm1, %k1
+; SKX-NEXT:    vpmovsxbd %xmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %wide = sext <16 x i8> %a to <16 x i32> ; sign-extend every i8 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; CHECK-LABEL: zext_16x8_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %wide = zext <16 x i8> %i to <16 x i32> ; zero-extend every i8 lane to i32
+  ret <16 x i32> %wide
+}
+
+define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
+; CHECK-LABEL: sext_16x8_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %wide = sext <16 x i8> %i to <16 x i32> ; sign-extend every i8 lane to i32
+  ret <16 x i32> %wide
+}
+
+define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_2x8mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_2x8mem_to_2x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <2 x i8>, <2 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <2 x i8> %narrow to <2 x i64> ; zero-extend every i8 lane to i64
+  %masked = select <2 x i1> %mask, <2 x i64> %wide, <2 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <2 x i64> %masked
+}
+define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_2x8mem_to_2x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbq (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_2x8mem_to_2x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxbq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <2 x i8>, <2 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <2 x i8> %narrow to <2 x i64> ; sign-extend every i8 lane to i64
+  %masked = select <2 x i1> %mask, <2 x i64> %wide, <2 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <2 x i64> %masked
+}
+define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone {
+; CHECK-LABEL: sext_2x8mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbq (%rdi), %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <2 x i8>, <2 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <2 x i8> %narrow to <2 x i64> ; sign-extend every i8 lane to i64
+  ret <2 x i64> %wide
+}
+
+define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x8mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_4x8mem_to_4x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <4 x i8>, <4 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <4 x i8> %narrow to <4 x i64> ; zero-extend every i8 lane to i64
+  %masked = select <4 x i1> %mask, <4 x i64> %wide, <4 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i64> %masked
+}
+
+define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_4x8mem_to_4x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_4x8mem_to_4x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxbq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <4 x i8>, <4 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i8> %narrow to <4 x i64> ; sign-extend every i8 lane to i64
+  %masked = select <4 x i1> %mask, <4 x i64> %wide, <4 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i64> %masked
+}
+
+define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone {
+; CHECK-LABEL: sext_4x8mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbq (%rdi), %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <4 x i8>, <4 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i8> %narrow to <4 x i64> ; sign-extend every i8 lane to i64
+  ret <4 x i64> %wide
+}
+
+define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x8mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x8mem_to_8x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <8 x i8>, <8 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <8 x i8> %narrow to <8 x i64> ; zero-extend every i8 lane to i64
+  %masked = select <8 x i1> %mask, <8 x i64> %wide, <8 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i64> %masked
+}
+
+define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x8mem_to_8x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_8x8mem_to_8x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxbq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <8 x i8>, <8 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i8> %narrow to <8 x i64> ; sign-extend every i8 lane to i64
+  %masked = select <8 x i1> %mask, <8 x i64> %wide, <8 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i64> %masked
+}
+
+define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone {
+; CHECK-LABEL: sext_8x8mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbq (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <8 x i8>, <8 x i8>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i8> %narrow to <8 x i64> ; sign-extend every i8 lane to i64
+  ret <8 x i64> %wide
+}
+
+define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x16mem_to_4x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_4x16mem_to_4x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; SKX-NEXT:    retq
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <4 x i16> %narrow to <4 x i32> ; zero-extend every i16 lane to i32
+  %masked = select <4 x i1> %mask, <4 x i32> %wide, <4 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i32> %masked
+}
+
+define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_4x16mem_to_4x32mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwd (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_4x16mem_to_4x32mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxwd (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i16> %narrow to <4 x i32> ; sign-extend every i16 lane to i32
+  %masked = select <4 x i1> %mask, <4 x i32> %wide, <4 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i32> %masked
+}
+
+define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_4x16mem_to_4x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd (%rdi), %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i16> %narrow to <4 x i32> ; sign-extend every i16 lane to i32
+  ret <4 x i32> %wide
+}
+
+
+define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x16mem_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x16mem_to_8x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SKX-NEXT:    retq
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <8 x i16> %narrow to <8 x i32> ; zero-extend every i16 lane to i32
+  %masked = select <8 x i1> %mask, <8 x i32> %wide, <8 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i32> %masked
+}
+
+define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x16mem_to_8x32mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwd (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_8x16mem_to_8x32mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxwd (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i16> %narrow to <8 x i32> ; sign-extend every i16 lane to i32
+  %masked = select <8 x i1> %mask, <8 x i32> %wide, <8 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i32> %masked
+}
+
+define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_8x16mem_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd (%rdi), %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i16> %narrow to <8 x i32> ; sign-extend every i16 lane to i32
+  ret <8 x i32> %wide
+}
+
+define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x16_to_8x32mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x16_to_8x32mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SKX-NEXT:    retq
+  %wide = zext <8 x i16> %a to <8 x i32> ; zero-extend every i16 lane to i32
+  %masked = select <8 x i1> %mask, <8 x i32> %wide, <8 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i32> %masked
+}
+
+define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone {
+; CHECK-LABEL: zext_8x16_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %wide = zext <8 x i16> %a to <8 x i32> ; zero-extend every i16 lane to i32
+  ret <8 x i32> %wide
+}
+
+define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x16mem_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_16x16mem_to_16x32:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
+; SKX-NEXT:    retq
+  %narrow = load <16 x i16>, <16 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <16 x i16> %narrow to <16 x i32> ; zero-extend every i16 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_16x16mem_to_16x32mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_16x16mem_to_16x32mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxwd (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <16 x i16>, <16 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <16 x i16> %narrow to <16 x i32> ; sign-extend every i16 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_16x16mem_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwd (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <16 x i16>, <16 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <16 x i16> %narrow to <16 x i32> ; sign-extend every i16 lane to i32
+  ret <16 x i32> %wide
+}
+define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_16x16_to_16x32mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_16x16_to_16x32mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm1, %xmm1
+; SKX-NEXT:    vpmovb2m %xmm1, %k1
+; SKX-NEXT:    vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; SKX-NEXT:    retq
+  %wide = zext <16 x i16> %a to <16 x i32> ; zero-extend every i16 lane to i32
+  %masked = select <16 x i1> %mask, <16 x i32> %wide, <16 x i32> zeroinitializer ; lanes with a false mask bit become zero
+  ret <16 x i32> %masked
+}
+
+define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone {
+; CHECK-LABEL: zext_16x16_to_16x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %wide = zext <16 x i16> %a to <16 x i32> ; zero-extend every i16 lane to i32
+  ret <16 x i32> %wide
+}
+
+define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_2x16mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_2x16mem_to_2x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <2 x i16>, <2 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <2 x i16> %narrow to <2 x i64> ; zero-extend every i16 lane to i64
+  %masked = select <2 x i1> %mask, <2 x i64> %wide, <2 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <2 x i64> %masked
+}
+
+define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_2x16mem_to_2x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwq (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_2x16mem_to_2x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxwq (%rdi), %xmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <2 x i16>, <2 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <2 x i16> %narrow to <2 x i64> ; sign-extend every i16 lane to i64
+  %masked = select <2 x i1> %mask, <2 x i64> %wide, <2 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <2 x i64> %masked
+}
+
+define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_2x16mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwq (%rdi), %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <2 x i16>, <2 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <2 x i16> %narrow to <2 x i64> ; sign-extend every i16 lane to i64
+  ret <2 x i64> %wide
+}
+
+define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x16mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_4x16mem_to_4x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <4 x i16> %narrow to <4 x i64> ; zero-extend every i16 lane to i64
+  %masked = select <4 x i1> %mask, <4 x i64> %wide, <4 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i64> %masked
+}
+
+define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_4x16mem_to_4x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_4x16mem_to_4x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovsxwq (%rdi), %ymm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i16> %narrow to <4 x i64> ; sign-extend every i16 lane to i64
+  %masked = select <4 x i1> %mask, <4 x i64> %wide, <4 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <4 x i64> %masked
+}
+
+define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_4x16mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwq (%rdi), %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <4 x i16>, <4 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <4 x i16> %narrow to <4 x i64> ; sign-extend every i16 lane to i64
+  ret <4 x i64> %wide
+}
+
+define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x16mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x16mem_to_8x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
+; SKX-NEXT:    retq
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <8 x i16> %narrow to <8 x i64> ; zero-extend every i16 lane to i64
+  %masked = select <8 x i1> %mask, <8 x i64> %wide, <8 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i64> %masked
+}
+
+define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x16mem_to_8x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: sext_8x16mem_to_8x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k1
+; SKX-NEXT:    vpmovsxwq (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i16> %narrow to <8 x i64> ; sign-extend every i16 lane to i64
+  %masked = select <8 x i1> %mask, <8 x i64> %wide, <8 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i64> %masked
+}
+
+define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone {
+; CHECK-LABEL: sext_8x16mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxwq (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %narrow = load <8 x i16>, <8 x i16>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = sext <8 x i16> %narrow to <8 x i64> ; sign-extend every i16 lane to i64
+  ret <8 x i64> %wide
+}
+
+define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x16_to_8x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_8x16_to_8x64mask:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
+; SKX-NEXT:    vpmovw2m %xmm1, %k1
+; SKX-NEXT:    vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; SKX-NEXT:    retq
+  %wide = zext <8 x i16> %a to <8 x i64> ; zero-extend every i16 lane to i64
+  %masked = select <8 x i1> %mask, <8 x i64> %wide, <8 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <8 x i64> %masked
+}
+
+define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone {
+; CHECK-LABEL: zext_8x16_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %wide = zext <8 x i16> %a to <8 x i64> ; zero-extend every i16 lane to i64
+  ret <8 x i64> %wide
+}
+
+define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_2x32mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: zext_2x32mem_to_2x64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; SKX-NEXT:    vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero
+; SKX-NEXT:    retq
+  %narrow = load <2 x i32>, <2 x i32>* %i, align 1 ; byte-aligned load of the source vector
+  %wide = zext <2 x i32> %narrow to <2 x i64> ; zero-extend every i32 lane to i64
+  %masked = select <2 x i1> %mask, <2 x i64> %wide, <2 x i64> zeroinitializer ; lanes with a false mask bit become zero
+  ret <2 x i64> %masked
+}
+
+; Masked sext of a loaded <2 x i32> to <2 x i64>; the load is expected to fold
+; into a zero-masked vpmovsxdq with a memory operand.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_2x32mem_to_2x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxdq (%rdi), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <2 x i32>,<2 x i32> *%i,align 1
+  %x   = sext <2 x i32> %a to <2 x i64>
+  %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer
+  ret <2 x i64> %ret
+}
+
+; Unmasked sext of a loaded <2 x i32> to <2 x i64>; the load is expected to
+; fold into vpmovsxdq.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone {
+; CHECK-LABEL: sext_2x32mem_to_2x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxdq (%rdi), %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <2 x i32>,<2 x i32> *%i,align 1
+  %x   = sext <2 x i32> %a to <2 x i64>
+  ret <2 x i64> %x
+}
+
+; Masked zext of a loaded <4 x i32> to <4 x i64> (masked despite the name having
+; no "mask" suffix); the load is expected to fold into a zero-masked vpmovzxdq.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x32mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <4 x i32>,<4 x i32> *%i,align 1
+  %x   = zext <4 x i32> %a to <4 x i64>
+  %ret = select <4 x  i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+  ret <4 x i64> %ret
+}
+
+; Masked sext of a loaded <4 x i32> to <4 x i64>; the load is expected to fold
+; into a zero-masked vpmovsxdq with a memory operand.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_4x32mem_to_4x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpmovsxdq (%rdi), %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <4 x i32>,<4 x i32> *%i,align 1
+  %x   = sext <4 x i32> %a to <4 x i64>
+  %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+  ret <4 x i64> %ret
+}
+
+; Unmasked sext of a loaded <4 x i32> to <4 x i64>; the load is expected to
+; fold into vpmovsxdq.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone {
+; CHECK-LABEL: sext_4x32mem_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxdq (%rdi), %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <4 x i32>,<4 x i32> *%i,align 1
+  %x   = sext <4 x i32> %a to <4 x i64>
+  ret <4 x i64> %x
+}
+
+; Unmasked sext <4 x i32> -> <4 x i64> from a register; a single xmm-to-ymm
+; vpmovsxdq is expected.
+define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone {
+; CHECK-LABEL: sext_4x32_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = sext <4 x i32> %a to <4 x i64>
+  ret <4 x i64> %x
+}
+
+; Masked zext <4 x i32> -> <4 x i64> from a register: lanes whose %mask bit is
+; clear are zeroed, so a zero-masked vpmovzxdq is expected.
+define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_4x32_to_4x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = zext <4 x i32> %a to <4 x i64>
+  %ret = select <4 x  i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer
+  ret <4 x i64> %ret
+}
+
+; Masked zext of a loaded <8 x i32> to <8 x i64> (masked despite the name having
+; no "mask" suffix); the load is expected to fold into a zero-masked vpmovzxdq.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x32mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <8 x i32>,<8 x i32> *%i,align 1
+  %x   = zext <8 x i32> %a to <8 x i64>
+  %ret = select <8 x  i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+  ret <8 x i64> %ret
+}
+
+; Masked sext of a loaded <8 x i32> to <8 x i64>; the load is expected to fold
+; into a zero-masked vpmovsxdq with a memory operand.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: sext_8x32mem_to_8x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k1
+; CHECK-NEXT:    vpmovsxdq (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <8 x i32>,<8 x i32> *%i,align 1
+  %x   = sext <8 x i32> %a to <8 x i64>
+  %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+  ret <8 x i64> %ret
+}
+
+; Unmasked sext of a loaded <8 x i32> to <8 x i64>; the load is expected to
+; fold into vpmovsxdq.
+; NOTE(review): declared readnone but loads through %i; confirm (readonly may be intended).
+define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone {
+; CHECK-LABEL: sext_8x32mem_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxdq (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a   = load <8 x i32>,<8 x i32> *%i,align 1
+  %x   = sext <8 x i32> %a to <8 x i64>
+  ret <8 x i64> %x
+}
+
+; Unmasked sext <8 x i32> -> <8 x i64> from a register; a single ymm-to-zmm
+; vpmovsxdq is expected.
+define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone {
+; CHECK-LABEL: sext_8x32_to_8x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxdq %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = sext <8 x i32> %a to <8 x i64>
+  ret <8 x i64> %x
+}
+
+; Masked zext <8 x i32> -> <8 x i64> from a register: lanes whose %mask bit is
+; clear are zeroed, so a zero-masked vpmovzxdq is expected.
+define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: zext_8x32_to_8x64mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = zext <8 x i32> %a to <8 x i64>
+  %ret = select <8 x  i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer
+  ret <8 x i64> %ret
+}
+; fptrunc <8 x double> -> <8 x float>; a single zmm-to-ymm vcvtpd2ps is
+; expected.
+define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone {
+; CHECK-LABEL: fptrunc_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtpd2ps %zmm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fptrunc <8 x double> %a to <8 x float>
+  ret <8 x float> %b
+}
+
+; fpext <8 x float> -> <8 x double>; a single ymm-to-zmm vcvtps2pd is
+; expected.
+define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
+; CHECK-LABEL: fpext_test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcvtps2pd %ymm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = fpext <8 x float> %a to <8 x double>
+  ret <8 x double> %b
+}
+
+; Reinterprets an i16 as <16 x i1> and zero-extends it to <16 x i32>; expected
+; code moves the scalar into %k1 and does a zero-masked broadcast of a constant.
+define   <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
+; CHECK-LABEL: zext_16i1_to_16xi32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = bitcast i16 %b to <16 x i1>
+  %c = zext <16 x i1> %a to <16 x i32>
+  ret <16 x i32> %c
+}
+
+; Reinterprets an i8 as <8 x i1> and zero-extends it to <8 x i64>; expected
+; code moves the scalar into %k1 and does a zero-masked broadcast of a constant.
+define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
+; CHECK-LABEL: zext_8i1_to_8xi64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = bitcast i8 %b to <8 x i1>
+  %c = zext <8 x i1> %a to <8 x i64>
+  ret <8 x i64> %c
+}
+
+; Truncates <16 x i8> to <16 x i1> and returns the mask bitcast to i16;
+; expected code shifts bit 0 into the sign position, then uses vpmovb2m/kmovd.
+define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
+; CHECK-LABEL: trunc_16i8_to_16i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask_b = trunc <16 x i8>%a to <16 x i1>
+  %mask = bitcast <16 x i1> %mask_b to i16
+  ret i16 %mask
+}
+
+; Truncates <16 x i32> to <16 x i1> and returns the mask bitcast to i16;
+; expected code uses vpslld/vptestmd on zmm and a vzeroupper before returning.
+define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
+; CHECK-LABEL: trunc_16i32_to_16i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %zmm0, %zmm0
+; CHECK-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask_b = trunc <16 x i32>%a to <16 x i1>
+  %mask = bitcast <16 x i1> %mask_b to i16
+  ret i16 %mask
+}
+
+; Truncates %a and %b to <4 x i1>, ANDs the two masks, and sign-extends the
+; result to <4 x i32>; the AND is expected to fold into a masked vptestmd.
+define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: trunc_4i32_to_4i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k0 {%k1}
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask_a = trunc <4 x i32>%a to <4 x i1>
+  %mask_b = trunc <4 x i32>%b to <4 x i1>
+  %a_and_b = and <4 x i1>%mask_a, %mask_b
+  %res = sext <4 x i1>%a_and_b to <4 x i32>
+  ret <4 x i32>%res
+}
+
+
+; Truncates <8 x i16> to <8 x i1> and returns the mask bitcast to i8;
+; expected code shifts bit 0 into the sign position, then uses vpmovw2m/kmovd.
+define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
+; CHECK-LABEL: trunc_8i16_to_8i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask_b = trunc <8 x i16>%a to <8 x i1>
+  %mask = bitcast <8 x i1> %mask_b to i8
+  ret i8 %mask
+}
+
+; Sign-extends the inverted result of (%a1 < %a2) to <8 x i32>; the xor with
+; all-ones is expected to fold into the compare predicate (vpcmpled).
+define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; CHECK-LABEL: sext_8i1_8i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpled %ymm0, %ymm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = icmp slt <8 x i32> %a1, %a2
+  %x1 = xor <8 x i1>%x, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+  %y = sext <8 x i1> %x1 to <8 x i32>
+  ret <8 x i32> %y
+}
+
+
+; Truncates %a to i1, inserts it as element 0 of a constant <16 x i1> vector,
+; and returns that vector bitcast to i16; expected code builds the result with
+; mask-register shifts and a korw.
+define i16 @trunc_i32_to_i1(i32 %a) {
+; CHECK-LABEL: trunc_i32_to_i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-4, %ax # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    kshiftrw $1, %k0, %k0
+; CHECK-NEXT:    kshiftlw $1, %k0, %k0
+; CHECK-NEXT:    andl $1, %edi # sched: [1:0.25]
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    korw %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a_i = trunc i32 %a to i1
+  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
+  %res = bitcast <16 x i1> %maskv to i16
+  ret i16 %res
+}
+
+; Sign-extends the <8 x i1> result of (%a1 < %a2) on <8 x i32> to <8 x i16>;
+; expected code compares into %k0 and expands it with vpmovm2w.
+define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; CHECK-LABEL: sext_8i1_8i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
+; CHECK-NEXT:    vpmovm2w %k0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i16>
+  ret <8 x i16> %y
+}
+
+; Sign-extends the <16 x i1> result of (%a1 < %a2) to <16 x i32>; expected
+; code compares into %k0 and expands it with vpmovm2d.
+define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind {
+; CHECK-LABEL: sext_16i1_16i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpgtd %zmm0, %zmm1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = icmp slt <16 x i32> %a1, %a2
+  %y = sext <16 x i1> %x to <16 x i32>
+  ret <16 x i32> %y
+}
+
+; Sign-extends the <8 x i1> result of (%a1 < %a2) on <8 x i32> to <8 x i64>;
+; expected code compares into %k0 and expands it with vpmovm2q.
+define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
+; CHECK-LABEL: sext_8i1_8i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = icmp slt <8 x i32> %a1, %a2
+  %y = sext <8 x i1> %x to <8 x i64>
+  ret <8 x i64> %y
+}
+
+; Loads <8 x i8> from %a, sign-extends it to <8 x i64>, and stores the result
+; to %res; the load is expected to fold into vpmovsxbq.
+define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
+; CHECK-LABEL: extload_v8i64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovsxbq (%rdi), %zmm0
+; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %sign_load = load <8 x i8>, <8 x i8>* %a
+  %c = sext <8 x i8> %sign_load to <8 x i64>
+  store <8 x i64> %c, <8 x i64>* %res
+  ret void
+}
+
+; Zero-masking select on <64 x i16> with a <64 x i1> mask; the expected code
+; handles the value as two zmm halves and shifts the mask right by 32 for the
+; second half.
+define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %zmm2, %zmm2
+; CHECK-NEXT:    vpmovb2m %zmm2, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kshiftrq $32, %k1, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm1 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer
+  ret <64 x i16> %ret
+}
+
+; Interleaves the bytes of %a with zero bytes via shufflevector and bitcasts
+; to <16 x i16>; the pattern is expected to be matched as a single vpmovzxbw.
+define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone {
+; CHECK-LABEL: shuffle_zext_16x8_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+  %2 = bitcast <32 x i8> %1 to <16 x i16>
+  ret <16 x i16> %2
+}
+
+; Same byte/zero interleave as a zext to <16 x i16>, followed by a zeroing
+; select on %mask; a zero-masked vpmovzxbw is expected.
+define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: shuffle_zext_16x8_to_16x16_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm1, %k1
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x   = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16, i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 13, i32 16, i32 14, i32 16, i32 15, i32 16>
+  %bc  = bitcast <32 x i8> %x to <16 x i16>
+  %ret = select <16 x i1> %mask, <16 x i16> %bc, <16 x i16> zeroinitializer
+  ret <16 x i16> %ret
+}
+
+; Interleaves the low 16 bytes of a <32 x i8> with zero bytes and bitcasts to
+; <16 x i16>; expected to be matched as vpmovzxbw from the low xmm half.
+define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) {
+; CHECK-LABEL: zext_32x8_to_16x16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 8, i32 32, i32 9, i32 32, i32 10, i32 32, i32 11, i32 32, i32 12, i32 32, i32 13, i32 32, i32 14, i32 32, i32 15, i32 32>
+  %2 = bitcast <32 x i8> %1 to <16 x i16>
+  ret <16 x i16> %2
+}
+
+; Places each of the low 8 bytes of a <32 x i8> in a 32-bit lane padded with
+; three zero bytes and bitcasts to <8 x i32>; expected to match vpmovzxbd.
+define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) {
+; CHECK-LABEL: zext_32x8_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 4, i32 32, i32 32, i32 32, i32 5, i32 32, i32 32, i32 32, i32 6, i32 32, i32 32, i32 32, i32 7, i32 32, i32 32, i32 32>
+  %2 = bitcast <32 x i8> %1 to <8 x i32>
+  ret <8 x i32> %2
+}
+
+; Places each of the low 4 bytes of a <32 x i8> in a 64-bit lane padded with
+; seven zero bytes and bitcasts to <4 x i64>; expected to match vpmovzxbq.
+define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) {
+; CHECK-LABEL: zext_32x8_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 1, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 2, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 3, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+  %2 = bitcast <32 x i8> %1 to <4 x i64>
+  ret <4 x i64> %2
+}
+
+; Interleaves the low 8 words of a <16 x i16> with zero words and bitcasts to
+; <8 x i32>; expected to match vpmovzxwd.
+define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) {
+; CHECK-LABEL: zext_16x16_to_8x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16, i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
+  %2 = bitcast <16 x i16> %1 to <8 x i32>
+  ret <8 x i32> %2
+}
+
+; Places each of the low 4 words of a <16 x i16> in a 64-bit lane padded with
+; three zero words and bitcasts to <4 x i64>; expected to match vpmovzxwq.
+define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) {
+; CHECK-LABEL: zext_16x16_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 16, i32 16, i32 1, i32 16, i32 16, i32 16, i32 2, i32 16, i32 16, i32 16, i32 3, i32 16, i32 16, i32 16>
+  %2 = bitcast <16 x i16> %1 to <4 x i64>
+  ret <4 x i64> %2
+}
+
+; Interleaves the low 4 dwords of an <8 x i32> with zero dwords and bitcasts
+; to <4 x i64>; expected to match vpmovzxdq.
+define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) {
+; CHECK-LABEL: zext_8x32_to_4x64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8>
+  %2 = bitcast <8 x i32> %1 to <4 x i64>
+  ret <4 x i64> %2
+}
+
+; Zero-extends the <64 x i1> result of (%x == %y) to <64 x i8>; expected code
+; compares into %k1 and does a zero-masked load of a RIP-relative constant.
+; Attribute group #0 is defined outside this part of the file.
+define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 {
+; CHECK-LABEL: zext_64xi1_to_64xi8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpeqb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <64 x i8> %x, %y
+  %1 = zext <64 x i1> %mask to <64 x i8>
+  ret <64 x i8> %1
+}
+
+; Zero-extends the <32 x i1> result of (%x == %y) to <32 x i16>; expected code
+; compares into %k1 and does a zero-masked load of a RIP-relative constant.
+; Attribute group #0 is defined outside this part of the file.
+define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 {
+; CHECK-LABEL: zext_32xi1_to_32xi16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <32 x i16> %x, %y
+  %1 = zext <32 x i1> %mask to <32 x i16>
+  ret <32 x i16> %1
+}
+
+; Zero-extends the <16 x i1> result of (%x == %y) to <16 x i16>; expected code
+; compares into %k1 and does a zero-masked load of a RIP-relative constant.
+; Attribute group #0 is defined outside this part of the file.
+define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 {
+; CHECK-LABEL: zext_16xi1_to_16xi16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <16 x i16> %x, %y
+  %1 = zext <16 x i1> %mask to <16 x i16>
+  ret <16 x i16> %1
+}
+
+
+; Zero-extends the <32 x i1> result of a <32 x i16> equality compare to
+; <32 x i8>; expected code compares on zmm and does a zero-masked ymm load.
+; Attribute group #0 is defined outside this part of the file.
+define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 {
+; CHECK-LABEL: zext_32xi1_to_32xi8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <32 x i16> %x, %y
+  %1 = zext <32 x i1> %mask to <32 x i8>
+  ret <32 x i8> %1
+}
+
+; Zero-extends the <4 x i1> result of a <4 x i8> equality compare to
+; <4 x i32>; the expected code first masks both operands to the low byte of
+; each 32-bit lane, then compares with vpcmpeqd.
+; Attribute group #0 is defined outside this part of the file.
+define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 {
+; CHECK-LABEL: zext_4xi1_to_4x32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [1:0.50]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <4 x i8> %x, %y
+  %1 = zext <4 x i1> %mask to <4 x i32>
+  ret <4 x i32> %1
+}
+
+; Zero-extends the <2 x i1> result of a <2 x i8> equality compare to
+; <2 x i64>; the expected code first masks both operands to the low byte of
+; each 64-bit lane, then compares with vpcmpeqq.
+; Attribute group #0 is defined outside this part of the file.
+define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 {
+; CHECK-LABEL: zext_2xi1_to_2xi64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [1:0.50]
+; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp eq <2 x i8> %x, %y
+  %1 = zext <2 x i1> %mask to <2 x i64>
+  ret <2 x i64> %1
+}
+
+; Computes (%a0 * %a1) + %a2 on <16 x float>.
+; NOTE(review): despite the "fmadd" name, the expected output is a separate
+; vmulps and vaddps rather than a fused instruction.
+define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_fmadd_ps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <16 x float> %a0, %a1
+  %res = fadd <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; Computes (%a0 * %a1) - %a2 on <16 x float>.
+; NOTE(review): despite the "fmsub" name, the expected output is a separate
+; vmulps and vsubps rather than a fused instruction.
+define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_fmsub_ps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vsubps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %x, %a2
+  ret <16 x float> %res
+}
+
+; Computes %a2 - (%a0 * %a1) on <16 x float>.
+; NOTE(review): despite the "fnmadd" name, the expected output is a separate
+; vmulps and vsubps rather than a fused instruction.
+define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_fnmadd_ps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vsubps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <16 x float> %a0, %a1
+  %res = fsub <16 x float> %a2, %x
+  ret <16 x float> %res
+}
+
+; Computes -(%a0 * %a1) - %a2 on <16 x float>; the negation is written as
+; (-0.0 - %x) and is expected to become a vxorps with a broadcast constant.
+; NOTE(review): despite the "fnmsub" name, no fused instruction is expected.
+define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+; CHECK-LABEL: test_x86_fnmsub_ps_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vsubps %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <16 x float> %a0, %a1
+  %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+                          float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+                          float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00,
+                          float -0.000000e+00>, %x
+  %res = fsub <16 x float> %y, %a2
+  ret <16 x float> %res
+}
+
+; Computes (%a0 * %a1) + %a2 on <8 x double>.
+; NOTE(review): despite the "fmadd" name, the expected output is a separate
+; vmulpd and vaddpd rather than a fused instruction.
+define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_x86_fmadd_pd_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <8 x double> %a0, %a1
+  %res = fadd <8 x double> %x, %a2
+  ret <8 x double> %res
+}
+
+; Computes (%a0 * %a1) - %a2 on <8 x double>.
+; NOTE(review): despite the "fmsub" name, the expected output is a separate
+; vmulpd and vsubpd rather than a fused instruction.
+define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
+; CHECK-LABEL: test_x86_fmsub_pd_z:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vsubpd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul <8 x double> %a0, %a1
+  %res = fsub <8 x double> %x, %a2
+  ret <8 x double> %res
+}
+
+; Scalar double (%a0 * %a1) - %a2 with all operands in registers.
+; NOTE(review): despite the "fmsub" name, the expected output is a separate
+; vmulsd and vsubsd rather than a fused instruction.
+define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) {
+; CHECK-LABEL: test_x86_fmsub_213:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vsubsd %xmm2, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = fmul double %a0, %a1
+  %res = fsub double %x, %a2
+  ret double %res
+}
+
+; Scalar double (%a0 * %a1) - *%a2_ptr; the load of the subtrahend is expected
+; to fold into vsubsd as a memory operand. No fused instruction is expected.
+define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_213_m:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vsubsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a2 = load double , double *%a2_ptr
+  %x = fmul double %a0, %a1
+  %res = fsub double %x, %a2
+  ret double %res
+}
+
+; Scalar double (%a0 * *%a2_ptr) - %a1; the load of the multiplicand is
+; expected to fold into vmulsd as a memory operand. No fused instruction is
+; expected.
+define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) {
+; CHECK-LABEL: test_x86_fmsub_231_m:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulsd (%rdi), %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a2 = load double , double *%a2_ptr
+  %x = fmul double %a0, %a2
+  %res = fsub double %x, %a1
+  ret double %res
+}
+
+; Multiplies %a1 by a splat float constant and adds %a2; the constant is
+; expected to be used as a {1to16} broadcast memory operand of vmulps.
+define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test231_br:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b1 = fmul <16 x float> %a1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  %b2 = fadd <16 x float> %b1, %a2
+  ret <16 x float> %b2
+}
+
+; Multiplies <16 x float> %a1 by %a2, then adds a vector holding the same
+; float constant in all 16 lanes.
+; The assertions expect a register-register vmulps followed by vaddps whose
+; constant operand is a RIP-relative {1to16} broadcast from memory.
+define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind {
+; CHECK-LABEL: test213_br:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmulps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b1 = fmul <16 x float> %a1, %a2
+  %b2 = fadd <16 x float> %b1, <float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000>
+  ret <16 x float> %b2
+}
+
+; mask(a*c+b, a): per lane, select(%mask, %a0 * load(%a2_ptrt) + %a1, %a0).
+; The <16 x i1> mask arrives in xmm2 and is converted to %k1 with
+; vpsllw $7 + vpmovb2m. The live assertions expect an unfused vmulps (with the
+; load as its (%rdi) operand) followed by a merge-masked vaddps into zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck; they describe a single masked vfmadd132ps,
+; which differs from what the live CHECK lines expect.
+define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd132_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm2, %k1
+; CHECK-NEXT:    vmulps (%rdi), %zmm0, %zmm2
+; CHECK-NEXT:    vaddps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_x86_fmadd132_ps:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT:    vpmovb2m %xmm2, %k1
+; SKX-NEXT:    vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+  %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a0, %a2
+  %y = fadd <16 x float> %x, %a1
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a0
+  ret <16 x float> %res
+}
+
+; mask(a*c+b, b): per lane, select(%mask, %a0 * load(%a2_ptrt) + %a1, %a1).
+; The <16 x i1> mask arrives in xmm2 and is converted to %k1 with
+; vpsllw $7 + vpmovb2m. The live assertions expect an unfused vmulps, a
+; merge-masked vaddps into zmm1, and a vmovaps copying the result to zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck; they describe a single masked vfmadd231ps,
+; which differs from what the live CHECK lines expect.
+define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd231_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm2, %k1
+; CHECK-NEXT:    vmulps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_x86_fmadd231_ps:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT:    vpmovb2m %xmm2, %k1
+; SKX-NEXT:    vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a0, %a2
+  %y = fadd <16 x float> %x, %a1
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+  ret <16 x float> %res
+}
+
+; mask(b*a+c, b): per lane, select(%mask, %a1 * %a0 + load(%a2_ptrt), %a1).
+; The <16 x i1> mask arrives in xmm2 and is converted to %k1 with
+; vpsllw $7 + vpmovb2m. The live assertions expect a register-register vmulps,
+; a merge-masked vaddps taking the load as its (%rdi) operand and writing zmm1,
+; and a vmovaps copying the result to zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck; they describe a single masked vfmadd213ps,
+; which differs from what the live CHECK lines expect.
+define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) {
+; CHECK-LABEL: test_x86_fmadd213_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm2, %k1
+; CHECK-NEXT:    vmulps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vaddps (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_x86_fmadd213_ps:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT:    vpmovb2m %xmm2, %k1
+; SKX-NEXT:    vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1}
+; SKX-NEXT:    vmovaps %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %a2   = load <16 x float>,<16 x float> *%a2_ptrt,align 1
+  %x = fmul <16 x float> %a1, %a0
+  %y = fadd <16 x float> %x, %a2
+  %res = select <16 x i1> %mask, <16 x float> %y, <16 x float> %a1
+  ret <16 x float> %res
+}
+
+; Bitwise AND of (%a + splat 2) with %b on <16 x i32>.
+; The assertions expect a vpaddd with a {1to16} broadcast constant followed by
+; vpandq (the quadword mnemonic, even though the IR elements are i32).
+define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpandd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2,
+                            i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %x = and <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; Computes (%a + splat 3) & ~%b on <16 x i32>; the NOT is written as an xor
+; with an all-ones vector.
+; The assertions expect a broadcast vpaddd followed by a single vpandnq, i.e.
+; the xor-with-all-ones and the and are matched as one and-not instruction.
+define <16 x i32> @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpandnd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3,
+                            i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  %b2 = xor <16 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1,
+                            i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %x = and <16 x i32> %a2, %b2
+  ret <16 x i32> %x
+}
+
+; Bitwise OR of (%a + splat 4) with %b on <16 x i32>.
+; The assertions expect a vpaddd with a {1to16} broadcast constant followed by
+; vporq (the quadword mnemonic, even though the IR elements are i32).
+define <16 x i32> @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpord:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4,
+                            i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  %x = or <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; Bitwise XOR of (%a + splat 5) with %b on <16 x i32>.
+; The assertions expect a vpaddd with a {1to16} broadcast constant followed by
+; vpxorq (the quadword mnemonic, even though the IR elements are i32).
+define <16 x i32> @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpxord:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5,
+                            i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  %x = xor <16 x i32> %a2, %b
+  ret <16 x i32> %x
+}
+
+; Bitwise AND of (%a + splat 6) with %b on <8 x i64>.
+; The assertions expect a vpaddq with a {1to8} broadcast constant followed by
+; vpandq.
+define <8 x i64> @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpandq:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6, i64 6>
+  %x = and <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; Computes (%a + splat 7) & ~%b on <8 x i64>; the NOT is written as an xor
+; with an all-ones vector.
+; The assertions expect a broadcast vpaddq followed by a single vpandnq.
+define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpandnq:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
+  %b2 = xor <8 x i64> %b, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  %x = and <8 x i64> %a2, %b2
+  ret <8 x i64> %x
+}
+
+; Bitwise OR of (%a + splat 8) with %b on <8 x i64>.
+; The assertions expect a vpaddq with a {1to8} broadcast constant followed by
+; vporq.
+define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vporq:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+  %x = or <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; Bitwise XOR of (%a + splat 9) with %b on <8 x i64>.
+; The assertions expect a vpaddq with a {1to8} broadcast constant followed by
+; vpxorq.
+define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp {
+; CHECK-LABEL: vpxorq:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; CHECK-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  ; Force the execution domain with an add.
+  %a2 = add <8 x i64> %a, <i64 9, i64 9, i64 9, i64 9, i64 9, i64 9, i64 9, i64 9>
+  %x = xor <8 x i64> %a2, %b
+  ret <8 x i64> %x
+}
+
+; Plain bitwise AND of two <64 x i8> vectors with no surrounding arithmetic.
+; The assertions expect a single vandps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: and_v64i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: and_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = and <64 x i8> %a, %b
+  ret <64 x i8> %res
+}
+
+; Computes %a & ~%b on <64 x i8>; the NOT is written as an xor with an
+; all-ones vector.
+; The assertions expect a single vandnps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: andn_v64i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandnps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: andn_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vandnps %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %b2 = xor <64 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                           i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                           i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1,
+                           i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %res = and <64 x i8> %a, %b2
+  ret <64 x i8> %res
+}
+
+; Plain bitwise OR of two <64 x i8> vectors with no surrounding arithmetic.
+; The assertions expect a single vorps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: or_v64i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vorps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: or_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vorps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = or <64 x i8> %a, %b
+  ret <64 x i8> %res
+}
+
+; Plain bitwise XOR of two <64 x i8> vectors with no surrounding arithmetic.
+; The assertions expect a single vxorps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) {
+; CHECK-LABEL: xor_v64i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: xor_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = xor <64 x i8> %a, %b
+  ret <64 x i8> %res
+}
+
+; Plain bitwise AND of two <32 x i16> vectors with no surrounding arithmetic.
+; The assertions expect a single vandps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: and_v32i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: and_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = and <32 x i16> %a, %b
+  ret <32 x i16> %res
+}
+
+; Computes %a & ~%b on <32 x i16>; the NOT is written as an xor with an
+; all-ones vector.
+; The assertions expect a single vandnps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: andn_v32i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vandnps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: andn_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vandnps %zmm0, %zmm1, %zmm0
+; SKX-NEXT:    retq
+  %b2 = xor <32 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1,
+                            i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %res = and <32 x i16> %a, %b2
+  ret <32 x i16> %res
+}
+
+; Plain bitwise OR of two <32 x i16> vectors with no surrounding arithmetic.
+; The assertions expect a single vorps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: or_v32i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vorps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: or_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vorps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = or <32 x i16> %a, %b
+  ret <32 x i16> %res
+}
+
+; Plain bitwise XOR of two <32 x i16> vectors with no surrounding arithmetic.
+; The assertions expect a single vxorps on zmm registers.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: xor_v32i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: xor_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    retq
+  %res = xor <32 x i16> %a, %b
+  ret <32 x i16> %res
+}
+
+; Merge-masked bitwise AND of two <16 x float> vectors, performed on their
+; <16 x i32> bitcasts. Lanes whose bit in the i16 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vandps that merges into
+; zmm2 under {%k1}, and a vaddps.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; CHECK-LABEL: masked_and_v16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: masked_and_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = and <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+; Merge-masked bitwise OR of two <16 x float> vectors, performed on their
+; <16 x i32> bitcasts. Lanes whose bit in the i16 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vorps that merges into
+; zmm2 under {%k1}, and a vaddps.
+define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; CHECK-LABEL: masked_or_v16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = or <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+; Merge-masked bitwise XOR of two <16 x float> vectors, performed on their
+; <16 x i32> bitcasts. Lanes whose bit in the i16 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vxorps that merges into
+; zmm2 under {%k1}, and a vaddps.
+define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) {
+; CHECK-LABEL: masked_xor_v16f32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorps %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddps %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a1 = bitcast <16 x float> %a to <16 x i32>
+  %b1 = bitcast <16 x float> %b to <16 x i32>
+  %passThru1 = bitcast <16 x float> %passThru to <16 x i32>
+  %mask1 = bitcast i16 %mask to <16 x i1>
+  %op = xor <16 x i32> %a1, %b1
+  %select = select <16 x i1> %mask1, <16 x i32> %op, <16 x i32> %passThru1
+  %cast = bitcast <16 x i32> %select to <16 x float>
+  %add = fadd <16 x float> %c, %cast
+  ret <16 x float> %add
+}
+
+; Merge-masked bitwise AND of two <8 x double> vectors, performed on their
+; <8 x i64> bitcasts. Lanes whose bit in the i8 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vandpd that merges into
+; zmm2 under {%k1}, and a vaddpd.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; CHECK-LABEL: masked_and_v8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: masked_and_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
+; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; SKX-NEXT:    retq
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = and <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
+
+; Merge-masked bitwise OR of two <8 x double> vectors, performed on their
+; <8 x i64> bitcasts. Lanes whose bit in the i8 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vorpd that merges into
+; zmm2 under {%k1}, and a vaddpd.
+define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; CHECK-LABEL: masked_or_v8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = or <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
+
+; Merge-masked bitwise XOR of two <8 x double> vectors, performed on their
+; <8 x i64> bitcasts. Lanes whose bit in the i8 %mask is clear take
+; %passThru instead; the selected vector is then added to %c with fadd.
+; The assertions expect kmovd of the mask into %k1, a vxorpd that merges into
+; zmm2 under {%k1}, and a vaddpd.
+define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) {
+; CHECK-LABEL: masked_xor_v8f64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorpd %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a1 = bitcast <8 x double> %a to <8 x i64>
+  %b1 = bitcast <8 x double> %b to <8 x i64>
+  %passThru1 = bitcast <8 x double> %passThru to <8 x i64>
+  %mask1 = bitcast i8 %mask to <8 x i1>
+  %op = xor <8 x i64> %a1, %b1
+  %select = select <8 x i1> %mask1, <8 x i64> %op, <8 x i64> %passThru1
+  %cast = bitcast <8 x i64> %select to <8 x double>
+  %add = fadd <8 x double> %c, %cast
+  ret <8 x double> %add
+}
+
+; ANDs %__a and %__b as <8 x i64>, then selects per 32-bit lane: the i16 %__k
+; is bitcast to <16 x i1> and chooses between the AND result and %__src (both
+; viewed as <16 x i32>).
+; The assertions expect kmovd of the mask into %k1 and one merge-masked vandps
+; writing zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; CHECK-LABEL: test_mm512_mask_and_epi32:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_and_epi32:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %and1.i.i = and <8 x i64> %__a, %__b
+  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
+  %1 = bitcast <8 x i64> %__src to <16 x i32>
+  %2 = bitcast i16 %__k to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+  %4 = bitcast <16 x i32> %3 to <8 x i64>
+  ret <8 x i64> %4
+}
+
+; ORs %__a and %__b as <8 x i64>, then selects per 32-bit lane: the i16 %__k
+; is bitcast to <16 x i1> and chooses between the OR result and %__src (both
+; viewed as <16 x i32>).
+; The assertions expect kmovd of the mask into %k1 and one merge-masked vorps
+; writing zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; CHECK-LABEL: test_mm512_mask_or_epi32:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_or_epi32:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vorps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %or1.i.i = or <8 x i64> %__a, %__b
+  %0 = bitcast <8 x i64> %or1.i.i to <16 x i32>
+  %1 = bitcast <8 x i64> %__src to <16 x i32>
+  %2 = bitcast i16 %__k to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+  %4 = bitcast <16 x i32> %3 to <8 x i64>
+  ret <8 x i64> %4
+}
+
+; XORs %__a and %__b as <8 x i64>, then selects per 32-bit lane: the i16 %__k
+; is bitcast to <16 x i1> and chooses between the XOR result and %__src (both
+; viewed as <16 x i32>).
+; The assertions expect kmovd of the mask into %k1 and one merge-masked vxorps
+; writing zmm0.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) {
+; CHECK-LABEL: test_mm512_mask_xor_epi32:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_xor_epi32:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %xor1.i.i = xor <8 x i64> %__a, %__b
+  %0 = bitcast <8 x i64> %xor1.i.i to <16 x i32>
+  %1 = bitcast <8 x i64> %__src to <16 x i32>
+  %2 = bitcast i16 %__k to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
+  %4 = bitcast <16 x i32> %3 to <8 x i64>
+  ret <8 x i64> %4
+}
+
+; Merge-masked XOR of two <8 x double> vectors: the XOR is done on <8 x i64>
+; bitcasts, and lanes whose bit in the i8 %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vxorpd that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_mask_xor_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_xor_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vxorpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %xor.i.i = xor <8 x i64> %0, %1
+  %2 = bitcast <8 x i64> %xor.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+  ret <8 x double> %4
+}
+
+; Zero-masked XOR of two <8 x double> vectors: the XOR is done on <8 x i64>
+; bitcasts, and lanes whose bit in the i8 %__U is clear become zero
+; (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vxorpd with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_maskz_xor_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_xor_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %xor.i.i = xor <8 x i64> %0, %1
+  %2 = bitcast <8 x i64> %xor.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+; Merge-masked XOR of two <16 x float> vectors: the XOR is done on <16 x i32>
+; bitcasts, and lanes whose bit in the i16 %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vxorps that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_mask_xor_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_xor_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %xor.i.i = xor <16 x i32> %0, %1
+  %2 = bitcast <16 x i32> %xor.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+  ret <16 x float> %4
+}
+
+; Zero-masked XOR of two <16 x float> vectors: the XOR is done on <16 x i32>
+; bitcasts, and lanes whose bit in the i16 %__U is clear become zero
+; (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vxorps with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_maskz_xor_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vxorps %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_xor_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %xor.i.i = xor <16 x i32> %0, %1
+  %2 = bitcast <16 x i32> %xor.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+; Merge-masked OR of two <8 x double> vectors: the OR is done on <8 x i64>
+; bitcasts (operands written as %__B | %__A), and lanes whose bit in the i8
+; %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vorpd that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_mask_or_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorpd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_or_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vorpd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %or.i.i = or <8 x i64> %1, %0
+  %2 = bitcast <8 x i64> %or.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+  ret <8 x double> %4
+}
+
+; Zero-masked OR of two <8 x double> vectors: the OR is done on <8 x i64>
+; bitcasts (operands written as %__B | %__A), and lanes whose bit in the i8
+; %__U is clear become zero (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vorpd with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_maskz_or_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_or_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vorpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %or.i.i = or <8 x i64> %1, %0
+  %2 = bitcast <8 x i64> %or.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+; Merge-masked OR of two <16 x float> vectors: the OR is done on <16 x i32>
+; bitcasts (operands written as %__B | %__A), and lanes whose bit in the i16
+; %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vorps that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_mask_or_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_or_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vorps %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %or.i.i = or <16 x i32> %1, %0
+  %2 = bitcast <16 x i32> %or.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+  ret <16 x float> %4
+}
+
+; Zero-masked OR of two <16 x float> vectors: the OR is done on <16 x i32>
+; bitcasts (operands written as %__B | %__A), and lanes whose bit in the i16
+; %__U is clear become zero (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vorps with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_maskz_or_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vorps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_or_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vorps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %or.i.i = or <16 x i32> %1, %0
+  %2 = bitcast <16 x i32> %or.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+; Merge-masked AND of two <8 x double> vectors: the AND is done on <8 x i64>
+; bitcasts (operands written as %__B & %__A), and lanes whose bit in the i8
+; %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vandpd that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_mask_and_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandpd %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_and_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandpd %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %and.i.i = and <8 x i64> %1, %0
+  %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+  ret <8 x double> %4
+}
+
+; Zero-masked AND of two <8 x double> vectors: the AND is done on <8 x i64>
+; bitcasts (operands written as %__B & %__A), and lanes whose bit in the i8
+; %__U is clear become zero (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vandpd with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_maskz_and_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_and_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %and.i.i = and <8 x i64> %1, %0
+  %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+; Merge-masked AND of two <16 x float> vectors: the AND is done on <16 x i32>
+; bitcasts (operands written as %__B & %__A), and lanes whose bit in the i16
+; %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vandps that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_mask_and_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandps %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_and_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandps %zmm1, %zmm2, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %and.i.i = and <16 x i32> %1, %0
+  %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+  ret <16 x float> %4
+}
+
+; Zero-masked AND of two <16 x float> vectors: the AND is done on <16 x i32>
+; bitcasts (operands written as %__B & %__A), and lanes whose bit in the i16
+; %__U is clear become zero (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vandps with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_maskz_and_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_and_ps:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %and.i.i = and <16 x i32> %1, %0
+  %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+; Merge-masked and-not of two <8 x double> vectors: computes %__B & ~%__A on
+; <8 x i64> bitcasts (the NOT is an xor with all-ones), and lanes whose bit in
+; the i8 %__U is clear keep %__W.
+; The assertions expect kmovd of the mask into %k1 and one vandnpd that merges
+; into zmm0 under {%k1}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_mask_andnot_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandnpd %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_mask_andnot_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandnpd %zmm2, %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %neg.i.i = xor <8 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %and.i.i = and <8 x i64> %1, %neg.i.i
+  %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__W
+  ret <8 x double> %4
+}
+
+; Zero-masked and-not of two <8 x double> vectors: computes %__B & ~%__A on
+; <8 x i64> bitcasts (the NOT is an xor with all-ones), and lanes whose bit in
+; the i8 %__U is clear become zero (select against zeroinitializer).
+; The assertions expect kmovd of the mask into %k1 and one vandnpd with
+; {%k1} {z}.
+; NOTE: the RUN line enables only the CHECK prefix, so the SKX-prefixed lines
+; are never matched by FileCheck.
+define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
+; CHECK-LABEL: test_mm512_maskz_andnot_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_mm512_maskz_andnot_pd:
+; SKX:       ## BB#0: ## %entry
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+entry:
+  %0 = bitcast <8 x double> %__A to <8 x i64>
+  %neg.i.i = xor <8 x i64> %0, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  %1 = bitcast <8 x double> %__B to <8 x i64>
+  %and.i.i = and <8 x i64> %1, %neg.i.i
+  %2 = bitcast <8 x i64> %and.i.i to <8 x double>
+  %3 = bitcast i8 %__U to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer
+  ret <8 x double> %4
+}
+
+; Merge-masked and-not on <16 x float>: select(%__U, ~%__A & %__B, %__W).
+; The xor/and/select chain is expected to fold into one masked vandnps.
+define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_mask_andnot_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandnps %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %neg.i.i = xor <16 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %and.i.i = and <16 x i32> %1, %neg.i.i
+  %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__W
+  ret <16 x float> %4
+}
+
+; Zero-masked and-not on <16 x float>: select(%__U, ~%__A & %__B, 0).
+; Expected to fold into one vandnps with {%k1} {z}.
+define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
+; CHECK-LABEL: test_mm512_maskz_andnot_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vandnps %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %0 = bitcast <16 x float> %__A to <16 x i32>
+  %neg.i.i = xor <16 x i32> %0, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %1 = bitcast <16 x float> %__B to <16 x i32>
+  %and.i.i = and <16 x i32> %1, %neg.i.i
+  %2 = bitcast <16 x i32> %and.i.i to <16 x float>
+  %3 = bitcast i16 %__U to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer
+  ret <16 x float> %4
+}
+
+; Bitcast float -> i32: expected to be a single XMM-to-GPR vmovd.
+define i32 @mov_test1(float %x) {
+; CHECK-LABEL: mov_test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %xmm0, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = bitcast float %x to i32
+   ret i32 %res
+}
+
+; Insert an i32 into lane 0 of an undef <4 x i32>: expected GPR-to-XMM vmovd.
+define <4 x i32> @mov_test2(i32 %x) {
+; CHECK-LABEL: mov_test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %edi, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = insertelement <4 x i32>undef, i32 %x, i32 0
+   ret <4 x i32>%res
+}
+
+; Insert an i64 into lane 0 of an undef <2 x i64>: expected GPR-to-XMM vmovq.
+define <2 x i64> @mov_test3(i64 %x) {
+; CHECK-LABEL: mov_test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovq %rdi, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = insertelement <2 x i64>undef, i64 %x, i32 0
+   ret <2 x i64>%res
+}
+
+; Load an i32 and insert it into lane 0 of an undef <4 x i32>: the load and
+; insert are expected to combine into one scalar vmovss load.
+define <4 x i32> @mov_test4(i32* %x) {
+; CHECK-LABEL: mov_test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load i32, i32* %x
+   %res = insertelement <4 x i32>undef, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; Scalar float store: expected to be a vmovss to memory.
+define void @mov_test5(float %x, float* %y) {
+; CHECK-LABEL: mov_test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss %xmm0, (%rdi) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   store float %x, float* %y, align 4
+   ret void
+}
+
+; Scalar double store: expected to be a vmovsd to memory.
+define void @mov_test6(double %x, double* %y) {
+; CHECK-LABEL: mov_test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovsd %xmm0, (%rdi) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   store double %x, double* %y, align 8
+   ret void
+}
+
+; Load an i32 and reinterpret it as float: expected to be a single vmovss
+; load straight into xmm0 (no GPR round trip).
+define float @mov_test7(i32* %x) {
+; CHECK-LABEL: mov_test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load i32, i32* %x
+   %res = bitcast i32 %y to float
+   ret float %res
+}
+
+; Extract lane 0 of a <4 x i32>: expected XMM-to-GPR vmovd.
+define i32 @mov_test8(<4 x i32> %x) {
+; CHECK-LABEL: mov_test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %xmm0, %eax # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = extractelement <4 x i32> %x, i32 0
+   ret i32 %res
+}
+
+; Extract lane 0 of a <2 x i64>: expected XMM-to-GPR vmovq.
+define i64 @mov_test9(<2 x i64> %x) {
+; CHECK-LABEL: mov_test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovq %xmm0, %rax # sched: [2:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = extractelement <2 x i64> %x, i32 0
+   ret i64 %res
+}
+
+; Load an i32 and insert it into lane 0 of an all-zero <4 x i32>: expected to
+; be one vmovss load, which leaves the remaining lanes zero.
+define <4 x i32> @mov_test10(i32* %x) {
+; CHECK-LABEL: mov_test10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load i32, i32* %x, align 4
+   %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; Load a float and insert it into lane 0 of an all-zero <4 x float>: expected
+; to be one vmovss load, which leaves the remaining lanes zero.
+define <4 x float> @mov_test11(float* %x) {
+; CHECK-LABEL: mov_test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load float, float* %x, align 4
+   %res = insertelement <4 x float>zeroinitializer, float %y, i32 0
+   ret <4 x float>%res
+}
+
+; Load a double and insert it into lane 0 of an all-zero <2 x double>:
+; expected to be one vmovsd load, which leaves the upper lane zero.
+define <2 x double> @mov_test12(double* %x) {
+; CHECK-LABEL: mov_test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load double, double* %x, align 8
+   %res = insertelement <2 x double>zeroinitializer, double %y, i32 0
+   ret <2 x double>%res
+}
+
+; Insert an i64 into lane 0 of an all-zero <2 x i64>: expected GPR-to-XMM
+; vmovq with no separate zeroing instruction.
+define <2 x i64> @mov_test13(i64 %x) {
+; CHECK-LABEL: mov_test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovq %rdi, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
+   ret <2 x i64>%res
+}
+
+; Insert an i32 into lane 0 of an all-zero <4 x i32>: expected GPR-to-XMM
+; vmovd with no separate zeroing instruction.
+define <4 x i32> @mov_test14(i32 %x) {
+; CHECK-LABEL: mov_test14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovd %edi, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
+   ret <4 x i32>%res
+}
+
+; Load an i32 and insert it into lane 0 of an all-zero <4 x i32>: expected to
+; be one vmovss load. NOTE(review): the IR body is identical to mov_test10.
+define <4 x i32> @mov_test15(i32* %x) {
+; CHECK-LABEL: mov_test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [1:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %y = load i32, i32* %x, align 4
+   %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0
+   ret <4 x i32>%res
+}
+
+; Unaligned (align 1) 512-bit load of <16 x i32>: expected vmovups.
+define <16 x i32> @mov_test16(i8 * %addr) {
+; CHECK-LABEL: mov_test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %res = load <16 x i32>, <16 x i32>* %vaddr, align 1
+  ret <16 x i32>%res
+}
+
+; 64-byte-aligned 512-bit load of <16 x i32>: expected vmovaps.
+define <16 x i32> @mov_test17(i8 * %addr) {
+; CHECK-LABEL: mov_test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %res = load <16 x i32>, <16 x i32>* %vaddr, align 64
+  ret <16 x i32>%res
+}
+
+; 64-byte-aligned 512-bit store of <8 x i64>: expected vmovaps, followed by
+; vzeroupper before the return.
+define void @mov_test18(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: mov_test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 64
+  ret void
+}
+
+; Unaligned (align 1) 512-bit store of <16 x i32>: expected vmovups, followed
+; by vzeroupper before the return.
+define void @mov_test19(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: mov_test19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 1
+  ret void
+}
+
+; 64-byte-aligned 512-bit store of <16 x i32>: expected vmovaps, followed by
+; vzeroupper before the return.
+define void @mov_test20(i8 * %addr, <16 x i32> %data) {
+; CHECK-LABEL: mov_test20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  store <16 x i32>%data, <16 x i32>* %vaddr, align 64
+  ret void
+}
+
+; 64-byte-aligned 512-bit load of <8 x i64>: expected vmovaps.
+define  <8 x i64> @mov_test21(i8 * %addr) {
+; CHECK-LABEL: mov_test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>, <8 x i64>* %vaddr, align 64
+  ret <8 x i64>%res
+}
+
+; Unaligned (align 1) 512-bit store of <8 x i64>: expected vmovups, followed
+; by vzeroupper before the return.
+define void @mov_test22(i8 * %addr, <8 x i64> %data) {
+; CHECK-LABEL: mov_test22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  store <8 x i64>%data, <8 x i64>* %vaddr, align 1
+  ret void
+}
+
+; Unaligned (align 1) 512-bit load of <8 x i64>: expected vmovups.
+define <8 x i64> @mov_test23(i8 * %addr) {
+; CHECK-LABEL: mov_test23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %res = load <8 x i64>, <8 x i64>* %vaddr, align 1
+  ret <8 x i64>%res
+}
+
+; 64-byte-aligned 512-bit store of <8 x double>: expected vmovaps, followed
+; by vzeroupper before the return.
+define void @mov_test24(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: mov_test24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 64
+  ret void
+}
+
+; 64-byte-aligned 512-bit load of <8 x double>: expected vmovaps.
+define <8 x double> @mov_test25(i8 * %addr) {
+; CHECK-LABEL: mov_test25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>, <8 x double>* %vaddr, align 64
+  ret <8 x double>%res
+}
+
+; 64-byte-aligned 512-bit store of <16 x float>: expected vmovaps, followed
+; by vzeroupper before the return.
+define void @mov_test26(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: mov_test26:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 64
+  ret void
+}
+
+; 64-byte-aligned 512-bit load of <16 x float>: expected vmovaps.
+define <16 x float> @mov_test27(i8 * %addr) {
+; CHECK-LABEL: mov_test27:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>, <16 x float>* %vaddr, align 64
+  ret <16 x float>%res
+}
+
+; Unaligned (align 1) 512-bit store of <8 x double>: expected vmovups,
+; followed by vzeroupper before the return.
+define void @mov_test28(i8 * %addr, <8 x double> %data) {
+; CHECK-LABEL: mov_test28:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  store <8 x double>%data, <8 x double>* %vaddr, align 1
+  ret void
+}
+
+; Unaligned (align 1) 512-bit load of <8 x double>: expected vmovups.
+define <8 x double> @mov_test29(i8 * %addr) {
+; CHECK-LABEL: mov_test29:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %res = load <8 x double>, <8 x double>* %vaddr, align 1
+  ret <8 x double>%res
+}
+
+; Unaligned (align 1) 512-bit store of <16 x float>: expected vmovups,
+; followed by vzeroupper before the return.
+define void @mov_test30(i8 * %addr, <16 x float> %data) {
+; CHECK-LABEL: mov_test30:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  store <16 x float>%data, <16 x float>* %vaddr, align 1
+  ret void
+}
+
+; Unaligned (align 1) 512-bit load of <16 x float>: expected vmovups.
+define <16 x float> @mov_test31(i8 * %addr) {
+; CHECK-LABEL: mov_test31:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %res = load <16 x float>, <16 x float>* %vaddr, align 1
+  ret <16 x float>%res
+}
+
+; Merge-masked aligned load of <16 x i32>: lanes where %mask1 != 0 take the
+; loaded value, the others keep %old. Expected vmovdqa32 with {%k1}.
+define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: mov_test32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>, <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; Merge-masked unaligned (align 1) load of <16 x i32>: lanes where
+; %mask1 != 0 take the loaded value, the others keep %old. Expected vmovdqu32.
+define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) {
+; CHECK-LABEL: mov_test33:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>, <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> %old
+  ret <16 x i32>%res
+}
+
+; Zero-masked aligned load of <16 x i32>: lanes where %mask1 != 0 take the
+; loaded value, the others are zero. Expected vmovdqa32 with {%k1} {z}.
+define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: mov_test34:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>, <16 x i32>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; Zero-masked unaligned (align 1) load of <16 x i32>: lanes where
+; %mask1 != 0 take the loaded value, the others are zero. Expected vmovdqu32
+; with {%k1} {z}.
+define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) {
+; CHECK-LABEL: mov_test35:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x i32>*
+  %r = load <16 x i32>, <16 x i32>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x i32> %r, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; Merge-masked aligned load of <8 x i64>: lanes where %mask1 != 0 take the
+; loaded value, the others keep %old. Expected vmovdqa64 with {%k1}.
+define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: mov_test36:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>, <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; Merge-masked unaligned (align 1) load of <8 x i64>: lanes where
+; %mask1 != 0 take the loaded value, the others keep %old. Expected vmovdqu64.
+define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) {
+; CHECK-LABEL: mov_test37:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>, <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> %old
+  ret <8 x i64>%res
+}
+
+; Zero-masked aligned load of <8 x i64>: lanes where %mask1 != 0 take the
+; loaded value, the others are zero. Expected vmovdqa64 with {%k1} {z}.
+define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: mov_test38:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>, <8 x i64>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; Zero-masked unaligned (align 1) load of <8 x i64>: lanes where
+; %mask1 != 0 take the loaded value, the others are zero. Expected vmovdqu64
+; with {%k1} {z}.
+define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) {
+; CHECK-LABEL: mov_test39:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i64> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x i64>*
+  %r = load <8 x i64>, <8 x i64>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x i64> %r, <8 x i64> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; Merge-masked aligned load of <16 x float>. The mask is
+; "fcmp one %mask1, 0" (ordered and not equal), which the expected output
+; builds from vcmpordps followed by a masked vcmpneqps; the load is a masked
+; vmovaps that keeps %old in unselected lanes.
+define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: mov_test40:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordps %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vcmpneqps %zmm2, %zmm1, %k1 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>, <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; Merge-masked unaligned (align 1) load of <16 x float>. The mask is
+; "fcmp one %mask1, 0", expected as vcmpordps + masked vcmpneqps; the load is
+; a masked vmovups that keeps %old in unselected lanes.
+define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) {
+; CHECK-LABEL: mov_test41:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordps %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vcmpneqps %zmm2, %zmm1, %k1 {%k1}
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>, <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> %old
+  ret <16 x float>%res
+}
+
+; Zero-masked aligned load of <16 x float>. The mask is "fcmp one %mask1, 0",
+; expected as vcmpordps + masked vcmpneqps; the load is vmovaps with
+; {%k1} {z}, zeroing unselected lanes.
+define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: mov_test42:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vcmpneqps %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>, <16 x float>* %vaddr, align 64
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; Zero-masked unaligned (align 1) load of <16 x float>. The mask is
+; "fcmp one %mask1, 0", expected as vcmpordps + masked vcmpneqps; the load is
+; vmovups with {%k1} {z}, zeroing unselected lanes.
+define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) {
+; CHECK-LABEL: mov_test43:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordps %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vcmpneqps %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovups (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <16 x float> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <16 x float>*
+  %r = load <16 x float>, <16 x float>* %vaddr, align 1
+  %res = select <16 x i1> %mask, <16 x float> %r, <16 x float> zeroinitializer
+  ret <16 x float>%res
+}
+
+; Merge-masked aligned load of <8 x double>. The mask is
+; "fcmp one %mask1, 0", expected as vcmpordpd + masked vcmpneqpd; the load is
+; a masked vmovapd that keeps %old in unselected lanes.
+define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: mov_test44:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordpd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vcmpneqpd %zmm2, %zmm1, %k1 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>, <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; Merge-masked unaligned (align 1) load of <8 x double>. The mask is
+; "fcmp one %mask1, 0", expected as vcmpordpd + masked vcmpneqpd; the load is
+; a masked vmovupd that keeps %old in unselected lanes.
+define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) {
+; CHECK-LABEL: mov_test45:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordpd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vcmpneqpd %zmm2, %zmm1, %k1 {%k1}
+; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>, <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> %old
+  ret <8 x double>%res
+}
+
+; Zero-masked aligned load of <8 x double>. The mask is "fcmp one %mask1, 0",
+; expected as vcmpordpd + masked vcmpneqpd; the load is vmovapd with
+; {%k1} {z}, zeroing unselected lanes.
+define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: mov_test46:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordpd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>, <8 x double>* %vaddr, align 64
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
+
+; Zero-masked unaligned (align 1) load of <8 x double>. The mask is
+; "fcmp one %mask1, 0", expected as vcmpordpd + masked vcmpneqpd; the load is
+; vmovupd with {%k1} {z}, zeroing unselected lanes.
+define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) {
+; CHECK-LABEL: mov_test47:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpordpd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = fcmp one <8 x double> %mask1, zeroinitializer
+  %vaddr = bitcast i8* %addr to <8 x double>*
+  %r = load <8 x double>, <8 x double>* %vaddr, align 1
+  %res = select <8 x i1> %mask, <8 x double> %r, <8 x double> zeroinitializer
+  ret <8 x double>%res
+}
+
+; Bitwise NOT of an i16 performed as <16 x i1> xor all-ones: expected to go
+; through a mask register and use knotw.
+define i16 @mask16(i16 %x) {
+; CHECK-LABEL: mask16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <16 x i1> %m1 to i16
+  ret i16 %ret
+}
+
+; NOT of an i16 as <16 x i1>, then zero-extended to i32: expected knotw with
+; the result read back via kmovw (no separate extension instruction).
+define i32 @mask16_zext(i16 %x) {
+; CHECK-LABEL: mask16_zext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %m2 = bitcast <16 x i1> %m1 to i16
+  %ret = zext i16 %m2 to i32
+  ret i32 %ret
+}
+
+; Bitwise NOT of an i8 performed as <8 x i1> xor all-ones: expected to go
+; through a mask register and use knotb.
+define i8 @mask8(i8 %x) {
+; CHECK-LABEL: mask8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    knotb %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  ret i8 %ret
+}
+
+; NOT of an i8 as <8 x i1>, then zero-extended to i32: expected knotb with
+; the result read back via kmovb (no separate extension instruction).
+define i32 @mask8_zext(i8 %x) {
+; CHECK-LABEL: mask8_zext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    knotb %k0, %k0
+; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %m2 = bitcast <8 x i1> %m1 to i8
+  %ret = zext i8 %m2 to i32
+  ret i32 %ret
+}
+
+; In-memory NOT of an i16 via <16 x i1>: the load and store are expected to
+; use kmovw directly to/from a mask register around knotw.
+define void @mask16_mem(i16* %ptr) {
+; CHECK-LABEL: mask16_mem:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovw (%rdi), %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovw %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load i16, i16* %ptr, align 4
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <16 x i1> %m1 to i16
+  store i16 %ret, i16* %ptr, align 4
+  ret void
+}
+
+; In-memory NOT of an i8 via <8 x i1>: the load and store are expected to
+; use kmovb directly to/from a mask register around knotb.
+define void @mask8_mem(i8* %ptr) {
+; CHECK-LABEL: mask8_mem:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    knotb %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %x = load i8, i8* %ptr, align 4
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
+  %ret = bitcast <8 x i1> %m1 to i8
+  store i8 %ret, i8* %ptr, align 4
+  ret void
+}
+
+; (x & y) | (x ^ y) computed on i16 values bitcast to <16 x i1>. With both
+; operands arriving in GPRs the expected code stays in scalar registers
+; (movl/xorl/andl/orl) and uses no mask-register instructions.
+define i16 @mand16(i16 %x, i16 %y) {
+; CHECK-LABEL: mand16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; CHECK-NEXT:    xorl %esi, %eax # sched: [1:0.25]
+; CHECK-NEXT:    andl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    orl %eax, %edi # sched: [1:0.25]
+; CHECK-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ma = bitcast i16 %x to <16 x i1>
+  %mb = bitcast i16 %y to <16 x i1>
+  %mc = and <16 x i1> %ma, %mb
+  %md = xor <16 x i1> %ma, %mb
+  %me = or <16 x i1> %mc, %md
+  %ret = bitcast <16 x i1> %me to i16
+  ret i16 %ret
+}
+
+; (a & b) | (a ^ b) where both operands are loaded from memory as <16 x i1>:
+; expected to be done in mask registers with kandw / kxorw / korw.
+define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
+; CHECK-LABEL: mand16_mem:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovw (%rdi), %k0
+; CHECK-NEXT:    kmovw (%rsi), %k1
+; CHECK-NEXT:    kandw %k1, %k0, %k2
+; CHECK-NEXT:    kxorw %k1, %k0, %k0
+; CHECK-NEXT:    korw %k0, %k2, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %ma = load <16 x i1>, <16 x i1>* %x
+  %mb = load <16 x i1>, <16 x i1>* %y
+  %mc = and <16 x i1> %ma, %mb
+  %md = xor <16 x i1> %ma, %mb
+  %me = or <16 x i1> %mc, %md
+  %ret = bitcast <16 x i1> %me to i16
+  ret i16 %ret
+}
+
+; Extract elements 8..15 of a <16 x i1> with shufflevector and return them as
+; an i8: expected to be a mask-register right shift by 8 (kshiftrw $8).
+define i8 @shuf_test1(i16 %v) nounwind {
+; CHECK-LABEL: shuf_test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kshiftrw $8, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %v1 = bitcast i16 %v to <16 x i1>
+   %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+   %mask1 = bitcast <8 x i1> %mask to i8
+   ret i8 %mask1
+}
+
+; Extract element 5 of an unsigned-greater-than compare result and zero-extend
+; it to i32. The bit is isolated in the mask register with kshiftlw $10 /
+; kshiftrw $15 before being moved to a GPR.
+define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kshiftlw $10, %k0, %k0
+; CHECK-NEXT:    kshiftrw $15, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl $1, %eax # sched: [1:0.25]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp_res = icmp ugt <16 x i32> %a, %b
+  %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+  %res = zext i1 %cmp_res.i1 to i32
+  ret i32 %res
+}
+
+; Extract element 5 of an unsigned-greater-than compare result and zero-extend
+; it to i16. The bit is isolated in the mask register with kshiftlw $10 /
+; kshiftrw $15 before being moved to a GPR.
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kshiftlw $10, %k0, %k0
+; CHECK-NEXT:    kshiftrw $15, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl $1, %eax # sched: [1:0.25]
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp_res = icmp ugt <16 x i32> %a, %b
+  %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+  %res = zext i1 %cmp_res.i1 to i16
+  ret i16 %res
+}
+
+; Extract element 5 of an unsigned-greater-than compare result and zero-extend
+; it to i8. The bit is isolated in the mask register with kshiftlw $10 /
+; kshiftrw $15 before being moved to a GPR.
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+; CHECK-LABEL: zext_test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kshiftlw $10, %k0, %k0
+; CHECK-NEXT:    kshiftrw $15, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %cmp_res = icmp ugt <16 x i32> %a, %b
+  %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+  %res = zext i1 %cmp_res.i1 to i8
+  ret i8 %res
+}
+
+; Stores an all-ones <8 x i1> to %R (expected kxnorw + kmovb), then writes a
+; constant mask with only element 0 clear to a stack slot, reloads it and
+; returns it as i8. The reload is expected to fold to the immediate -2.
+define i8 @conv1(<8 x i1>* %R) {
+; CHECK-LABEL: conv1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
+; CHECK-NEXT:    movb $-2, %al # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
+
+  %maskPtr = alloca <8 x i1>
+  store <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %maskPtr
+  %mask = load <8 x i1>, <8 x i1>* %maskPtr
+  %mask_convert = bitcast <8 x i1> %mask to i8
+  ret i8 %mask_convert
+}
+
+define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; CHECK-NEXT:    vpcmpgtq %ymm3, %ymm2, %k1
+; CHECK-NEXT:    kandnw %k0, %k1, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test4:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; SKX-NEXT:    vpcmpgtq %ymm3, %ymm2, %k1
+; SKX-NEXT:    kandnw %k0, %k1, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x_gt_y = icmp sgt <4 x i64> %x, %y
+  %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
+  %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
+  %resse = sext <4 x i1>%res to <4 x i32>
+  ret <4 x i32> %resse
+}
+
+define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { ; sign-extend a signed compare between two v2i1 compare masks
+; CHECK-LABEL: vcmp_test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
+; CHECK-NEXT:    vpcmpgtq %xmm3, %xmm2, %k1
+; CHECK-NEXT:    kandnw %k1, %k0, %k0
+; CHECK-NEXT:    vpmovm2q %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vcmp_test5:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpgtq %xmm0, %xmm1, %k0
+; SKX-NEXT:    vpcmpgtq %xmm3, %xmm2, %k1
+; SKX-NEXT:    kandnw %k1, %k0, %k0
+; SKX-NEXT:    vpmovm2q %k0, %xmm0
+; SKX-NEXT:    retq
+  %x_lt_y = icmp slt <2 x i64> %x, %y ; signed x < y
+  %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1 ; signed x1 > y1
+  %res = icmp slt <2 x i1>%x_lt_y, %x1_gt_y1 ; signed compare of the two i1 masks
+  %resse = sext <2 x i1>%res to <2 x i64> ; widen each mask bit to a full i64 lane
+  ret <2 x i64> %resse
+}
+
+define void @vcmp_test6(<16 x i1> %mask)  { ; no assertions recorded yet; regenerate with utils/update_llc_test_checks.py
+allocas:
+  %a= and <16 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+  %b = bitcast <16 x i1> %a to i16 ; view the masked lanes as a scalar
+  %c = icmp eq i16 %b, 0
+  br i1 %c, label %true, label %false ; both successors just return
+true:
+  ret void
+false:
+  ret void
+}
+define void @vcmp_test7(<8 x i1> %mask)  {
+; CHECK-LABEL: vcmp_test7:
+; CHECK:       # BB#0: # %allocas
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k0
+; CHECK-NEXT:    movb $85, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    korb %k1, %k0, %k0
+; CHECK-NEXT:    ktestb %k0, %k0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vcmp_test7:
+; SKX:       ## BB#0: ## %allocas
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    movb $85, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    ktestb %k0, %k0
+; SKX-NEXT:    retq
+allocas:
+  %a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
+  %b = bitcast <8 x i1> %a to i8
+  %c = icmp eq i8 %b, 0
+  br i1 %c, label %true, label %false
+
+true:
+  ret void
+
+false:
+  ret void
+}
+define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
+; CHECK-LABEL: vcmp_test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    cmpl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    jg .LBB386_1 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB386_1:
+; CHECK-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vcmp_test8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT:    jg LBB17_1
+; SKX-NEXT:  ## BB#2:
+; SKX-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
+; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+; SKX-NEXT:  LBB17_1:
+; SKX-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
+; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %cond = icmp sgt i32 %a1, %b1
+  %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
+  %cmp2 = icmp ult <16 x i32> %b, zeroinitializer
+  %mix = select i1 %cond, <16 x i1> %cmp1, <16 x i1> %cmp2
+  %res = sext <16 x i1> %mix to <16 x i8>
+  ret <16 x i8> %res
+}
+define <16 x i1> @vpmov_test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; pick one of two v16i1 masks based on a scalar signed compare
+; CHECK-LABEL: vpmov_test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    jg .LBB387_1 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    vpsllw $7, %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    jmp .LBB387_3 # sched: [1:1.00]
+; CHECK-NEXT:  .LBB387_1:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:  .LBB387_3:
+; CHECK-NEXT:    vpmovb2m %xmm0, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vpmov_test9:
+; SKX:       ## BB#0:
+; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    jg LBB18_1
+; SKX-NEXT:  ## BB#2:
+; SKX-NEXT:    vpsllw $7, %xmm1, %xmm0
+; SKX-NEXT:    jmp LBB18_3
+; SKX-NEXT:  LBB18_1:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:  LBB18_3:
+; SKX-NEXT:    vpmovb2m %xmm0, %k0
+; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp sgt i32 %a1, %b1 ; scalar condition: a1 > b1 (signed)
+  %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b ; whole-vector select on the scalar condition
+  ret <16 x i1>%c
+}
+define <8 x i1> @vpmov_test10(<8 x i1>%a, <8 x i1>%b, i32 %a1, i32 %b1) { ; v8i1 variant of the same select; no assertions recorded yet, regenerate with utils/update_llc_test_checks.py
+  %mask = icmp sgt i32 %a1, %b1 ; scalar condition: a1 > b1 (signed)
+  %c = select i1 %mask, <8 x i1>%a, <8 x i1>%b
+  ret <8 x i1>%c
+}
+define <4 x i1> @vmov_test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
+; CHECK-LABEL: vmov_test11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    jg .LBB389_1 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    jmp .LBB389_3 # sched: [1:1.00]
+; CHECK-NEXT:  .LBB389_1:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:  .LBB389_3:
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test11:
+; SKX:       ## BB#0:
+; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    jg LBB20_1
+; SKX-NEXT:  ## BB#2:
+; SKX-NEXT:    vpslld $31, %xmm1, %xmm0
+; SKX-NEXT:    jmp LBB20_3
+; SKX-NEXT:  LBB20_1:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:  LBB20_3:
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    retq
+  %mask = icmp sgt i32 %a1, %b1
+  %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b
+  ret <4 x i1>%c
+}
+
+define i32 @vmov_test12(i32 %x, i32 %y)  { ; select on a constant mask bit; the expected output simply returns %x
+; CHECK-LABEL: vmov_test12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = bitcast i16 21845 to <16 x i1> ; 21845 = 0x5555
+  %b = extractelement <16 x i1> %a, i32 0 ; lane 0 of the constant mask
+  %c = select i1 %b, i32 %x, i32 %y ; constant condition, so the select folds away
+  ret i32 %c
+}
+
+define i32 @vmov_test13(i32 %x, i32 %y)  { ; select on a constant mask bit; the expected output simply returns %y
+; CHECK-LABEL: vmov_test13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl %esi, %eax # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = bitcast i16 21845 to <16 x i1> ; 21845 = 0x5555
+  %b = extractelement <16 x i1> %a, i32 3 ; lane 3 of the constant mask
+  %c = select i1 %b, i32 %x, i32 %y ; constant condition, so the select folds away
+  ret i32 %c
+}
+define <4 x i1> @vmov_test14()  { ; no assertions recorded yet; regenerate with utils/update_llc_test_checks.py
+  %a = bitcast i16 21845 to <16 x i1> ; 21845 = 0x5555
+  %b = extractelement <16 x i1> %a, i32 2 ; lane 2 of the constant mask
+  %c = insertelement <4 x i1> <i1 true, i1 false, i1 false, i1 true>, i1 %b, i32 1 ; overwrite lane 1 of a constant v4i1
+  ret <4 x i1> %c
+}
+define <16 x i1> @vmov_test15(i32 %x, i32 %y)  {
+; CHECK-LABEL: vmov_test15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi # sched: [1:0.25]
+; CHECK-NEXT:    movw $21845, %ax # imm = 0x5555
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    movw $1, %cx # sched: [1:0.25]
+; CHECK-NEXT:    cmovgw %ax, %cx # sched: [1:1.00]
+; CHECK-NEXT:    kmovd %ecx, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test15:
+; SKX:       ## BB#0:
+; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    movw $21845, %ax ## imm = 0x5555
+; SKX-NEXT:    movw $1, %cx
+; SKX-NEXT:    cmovgw %ax, %cx
+; SKX-NEXT:    kmovd %ecx, %k0
+; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    retq
+  %a = bitcast i16 21845 to <16 x i1>
+  %b = bitcast i16 1 to <16 x i1>
+  %mask = icmp sgt i32 %x, %y
+  %c = select i1 %mask, <16 x i1> %a, <16 x i1> %b
+  ret <16 x i1> %c
+}
+
+define <64 x i8> @vmov_test16(i64 %x) {
+;
+; CHECK-LABEL: vmov_test16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovq %rdi, %k0
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpmovm2b %k1, %zmm0
+; CHECK-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovm2b %k0, %zmm1
+; CHECK-NEXT:    movl $32, %eax # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; CHECK-NEXT:    vpmovb2m %zmm0, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovq %rdi, %k0
+; SKX-NEXT:    movb $1, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpmovm2b %k1, %zmm0
+; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
+; SKX-NEXT:    vpmovm2b %k0, %zmm1
+; SKX-NEXT:    movl $32, %eax
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    vpmovm2b %k0, %zmm0
+; SKX-NEXT:    retq
+  %a = bitcast i64 %x to <64 x i1>
+  %b = insertelement <64 x i1>%a, i1 true, i32 5
+  %c = sext <64 x i1>%b to <64 x i8>
+  ret <64 x i8>%c
+}
+
+define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) {
+;
+; CHECK-LABEL: vmov_test17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovq %rdi, %k0
+; CHECK-NEXT:    cmpl %edx, %esi # sched: [1:0.25]
+; CHECK-NEXT:    setg %al # sched: [1:1.00]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpmovm2b %k1, %zmm0
+; CHECK-NEXT:    vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovm2b %k0, %zmm1
+; CHECK-NEXT:    movl $32, %eax # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; CHECK-NEXT:    vpmovb2m %zmm0, %k0
+; CHECK-NEXT:    vpmovm2b %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test17:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovq %rdi, %k0
+; SKX-NEXT:    cmpl %edx, %esi
+; SKX-NEXT:    setg %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpmovm2b %k1, %zmm0
+; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
+; SKX-NEXT:    vpmovm2b %k0, %zmm1
+; SKX-NEXT:    movl $32, %eax
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; SKX-NEXT:    vpmovb2m %zmm0, %k0
+; SKX-NEXT:    vpmovm2b %k0, %zmm0
+; SKX-NEXT:    retq
+  %a = bitcast i64 %x to <64 x i1>
+  %b = icmp sgt i32 %y, %z
+  %c = insertelement <64 x i1>%a, i1 %b, i32 5
+  %d = sext <64 x i1>%c to <64 x i8>
+  ret <64 x i8>%d
+}
+
+define <8 x i1> @vmov_test18(i8 %a, i16 %y) {
+; CHECK-LABEL: vmov_test18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kshiftlw $7, %k1, %k2
+; CHECK-NEXT:    kshiftrw $15, %k2, %k2
+; CHECK-NEXT:    kmovd %k2, %eax
+; CHECK-NEXT:    kshiftlw $6, %k1, %k1
+; CHECK-NEXT:    kshiftrw $15, %k1, %k1
+; CHECK-NEXT:    kmovd %k1, %ecx
+; CHECK-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-NEXT:    kmovd %ecx, %k0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm1
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [5:0.50]
+; CHECK-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; CHECK-NEXT:    vpmovq2m %zmm2, %k0
+; CHECK-NEXT:    kshiftlb $1, %k0, %k0
+; CHECK-NEXT:    kshiftrb $1, %k0, %k0
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kshiftlb $7, %k1, %k1
+; CHECK-NEXT:    korb %k1, %k0, %k0
+; CHECK-NEXT:    vpmovm2w %k0, %xmm0
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test18:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kshiftlw $7, %k1, %k2
+; SKX-NEXT:    kshiftrw $15, %k2, %k2
+; SKX-NEXT:    kmovd %k2, %eax
+; SKX-NEXT:    kshiftlw $6, %k1, %k1
+; SKX-NEXT:    kshiftrw $15, %k1, %k1
+; SKX-NEXT:    kmovd %k1, %ecx
+; SKX-NEXT:    vpmovm2q %k0, %zmm0
+; SKX-NEXT:    kmovd %ecx, %k0
+; SKX-NEXT:    vpmovm2q %k0, %zmm1
+; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
+; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; SKX-NEXT:    vpmovq2m %zmm2, %k0
+; SKX-NEXT:    kshiftlb $1, %k0, %k0
+; SKX-NEXT:    kshiftrb $1, %k0, %k0
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    kshiftlb $7, %k1, %k1
+; SKX-NEXT:    korb %k1, %k0, %k0
+; SKX-NEXT:    vpmovm2w %k0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %b = bitcast i8 %a to <8 x i1>
+  %b1 = bitcast i16 %y to <16 x i1>
+  %el1 = extractelement <16 x i1>%b1, i32 8
+  %el2 = extractelement <16 x i1>%b1, i32 9
+  %c = insertelement <8 x i1>%b, i1 %el1, i32 7
+  %d = insertelement <8 x i1>%c, i1 %el2, i32 6
+  ret <8 x i1>%d
+}
+define <32 x i16> @vmov_test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
+; CHECK-LABEL: vmov_test21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm1, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test21:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %ymm1, %ymm1
+; SKX-NEXT:    vpmovb2m %ymm1, %k1
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+define void @vmov_test22(<4 x i1> %a, <4 x i1>* %addr) {
+; CHECK-LABEL: vmov_test22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test22:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <4 x i1> %a, <4 x i1>* %addr
+  ret void
+}
+
+define void @vmov_test23(<2 x i1> %a, <2 x i1>* %addr) {
+; CHECK-LABEL: vmov_test23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: vmov_test23:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <2 x i1> %a, <2 x i1>* %addr
+  ret void
+}
+
+define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
+; CHECK-LABEL: store_v1i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    kxorw %k1, %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rsi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_v1i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kxnorw %k0, %k0, %k1
+; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kmovb %k0, (%rsi)
+; SKX-NEXT:    retq
+  %x = xor <1 x i1> %c, <i1 1>
+  store <1 x i1> %x, <1 x i1>*  %ptr, align 4
+  ret void
+}
+
+define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
+; CHECK-LABEL: store_v2i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_v2i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllq $63, %xmm0, %xmm0
+; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k0
+; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  %x = xor <2 x i1> %c, <i1 1, i1 1>
+  store <2 x i1> %x, <2 x i1>*  %ptr, align 4
+  ret void
+}
+
+define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
+; CHECK-LABEL: store_v4i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_v4i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
+; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
+; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
+  store <4 x i1> %x, <4 x i1>*  %ptr, align 4
+  ret void
+}
+
+define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
+; CHECK-LABEL: store_v8i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k0
+; CHECK-NEXT:    knotb %k0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_v8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    knotb %k0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+  store <8 x i1> %x, <8 x i1>*  %ptr, align 4
+  ret void
+}
+
+define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
+; CHECK-LABEL: store_v16i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k0
+; CHECK-NEXT:    knotw %k0, %k0
+; CHECK-NEXT:    kmovw %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_v16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k0
+; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kmovw %k0, (%rdi)
+; SKX-NEXT:    retq
+  %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
+  store <16 x i1> %x, <16 x i1>*  %ptr, align 4
+  ret void
+}
+
+;void f2(int);
+;void f1(int c)
+;{
+;  static int v = 0;
+;  if (v == 0)
+;    v = 1;
+;  else
+;    v = 0;
+;  f2(v);
+;}
+
+@f1.v = internal unnamed_addr global i1 false, align 4 ; the static 'v' from the C sketch above, stored as an i1
+
+define void @f1(i32 %c) {
+; CHECK-LABEL: f1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movzbl {{.*}}(%rip), %edi # sched: [1:0.50]
+; CHECK-NEXT:    xorl $1, %edi # sched: [1:0.25]
+; CHECK-NEXT:    movb %dil, {{.*}}(%rip) # sched: [1:1.00]
+; CHECK-NEXT:    jmp f2 # TAILCALL
+entry:
+  %.b1 = load i1, i1* @f1.v, align 4
+  %not..b1 = xor i1 %.b1, true
+  store i1 %not..b1, i1* @f1.v, align 4
+  %0 = zext i1 %not..b1 to i32
+  tail call void @f2(i32 %0) #2
+  ret void
+}
+
+declare void @f2(i32) #1
+
+define void @store_i16_i1(i16 %x, i1 *%y) {
+; CHECK-LABEL: store_i16_i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl $1, %edi # sched: [1:0.25]
+; CHECK-NEXT:    movb %dil, (%rsi) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %c = trunc i16 %x to i1
+  store i1 %c, i1* %y
+  ret void
+}
+
+define void @store_i8_i1(i8 %x, i1 *%y) {
+; CHECK-LABEL: store_i8_i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl $1, %edi # sched: [1:0.25]
+; CHECK-NEXT:    movb %dil, (%rsi) # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %c = trunc i8 %x to i1
+  store i1 %c, i1* %y
+  ret void
+}
+
+define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
+; CHECK-LABEL: test_build_vec_v32i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1497715861, %eax # imm = 0x59455495
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_build_vec_v32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
+  ret <32 x i16> %ret
+}
+
+define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
+; CHECK-LABEL: test_build_vec_v64i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $6432645796886517060, %rax # imm = 0x5945594549549544
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: test_build_vec_v64i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
+; SKX-NEXT:    kmovq %rax, %k1
+; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+  %ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
+  ret <64 x i8> %ret
+}
+
+define void @ktest_1(<8 x double> %in, double * %base) { ; branch on whether a combined v8i1 compare mask is all-zero
+; CHECK-LABEL: ktest_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovupd (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; CHECK-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    ktestb %k0, %k0
+; CHECK-NEXT:    je .LBB410_2 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#1: # %L1
+; CHECK-NEXT:    vmovapd %zmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB410_2: # %L2
+; CHECK-NEXT:    vmovapd %zmm0, 8(%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: ktest_1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovupd (%rdi), %zmm1
+; SKX-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; SKX-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; SKX-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; SKX-NEXT:    ktestb %k0, %k0
+; SKX-NEXT:    je LBB41_2
+; SKX-NEXT:  ## BB#1: ## %L1
+; SKX-NEXT:    vmovapd %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+; SKX-NEXT:  LBB41_2: ## %L2
+; SKX-NEXT:    vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %addr1 = getelementptr double, double * %base, i64 0 ; &base[0]
+  %addr2 = getelementptr double, double * %base, i64 1 ; &base[1], one double past %base
+
+  %vaddr1 = bitcast double* %addr1 to <8 x double>* ; reinterpret both addresses as <8 x double> pointers
+  %vaddr2 = bitcast double* %addr2 to <8 x double>*
+
+  %val1 = load <8 x double>, <8 x double> *%vaddr1, align 1 ; align 1: the loads may be unaligned
+  %val2 = load <8 x double>, <8 x double> *%vaddr2, align 1
+
+  %sel1 = fcmp ogt <8 x double>%in, %val1 ; first mask: in > val1 (ordered)
+  %val3 = select <8 x i1> %sel1, <8 x double> %val2, <8 x double> zeroinitializer ; val2 where %sel1 holds, zero elsewhere
+  %sel2 = fcmp olt <8 x double> %in, %val3 ; second mask: in < val3 (ordered)
+  %sel3 = and <8 x i1> %sel1, %sel2 ; a lane survives only if both compares hold
+
+  %int_sel3 = bitcast <8 x i1> %sel3 to i8 ; view the 8-lane mask as a scalar
+  %res = icmp eq i8 %int_sel3, zeroinitializer
+  br i1 %res, label %L2, label %L1 ; no lane set -> L2, otherwise L1
+L1:
+  store <8 x double> %in, <8 x double>* %vaddr1 ; no explicit align on either store
+  br label %End
+L2:
+  store <8 x double> %in, <8 x double>* %vaddr2
+  br label %End
+End:
+  ret void
+}
+
+define void @ktest_2(<32 x float> %in, float * %base) {
+;
+; CHECK-LABEL: ktest_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovups (%rdi), %zmm2 # sched: [5:0.50]
+; CHECK-NEXT:    vmovups 64(%rdi), %zmm3 # sched: [5:0.50]
+; CHECK-NEXT:    vcmpltps %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vcmpltps %zmm1, %zmm3, %k2
+; CHECK-NEXT:    kunpckwd %k1, %k2, %k0
+; CHECK-NEXT:    vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [5:0.50]
+; CHECK-NEXT:    vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [5:0.50]
+; CHECK-NEXT:    vcmpltps %zmm3, %zmm0, %k1
+; CHECK-NEXT:    vcmpltps %zmm2, %zmm1, %k2
+; CHECK-NEXT:    kunpckwd %k1, %k2, %k1
+; CHECK-NEXT:    kord %k1, %k0, %k0
+; CHECK-NEXT:    ktestd %k0, %k0
+; CHECK-NEXT:    je .LBB411_2 # sched: [1:1.00]
+; CHECK-NEXT:  # BB#1: # %L1
+; CHECK-NEXT:    vmovaps %zmm0, (%rdi)
+; CHECK-NEXT:    vmovaps %zmm1, 64(%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; CHECK-NEXT:  .LBB411_2: # %L2
+; CHECK-NEXT:    vmovaps %zmm0, 4(%rdi)
+; CHECK-NEXT:    vmovaps %zmm1, 68(%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: ktest_2:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmovups (%rdi), %zmm2
+; SKX-NEXT:    vmovups 64(%rdi), %zmm3
+; SKX-NEXT:    vcmpltps %zmm0, %zmm2, %k1
+; SKX-NEXT:    vcmpltps %zmm1, %zmm3, %k2
+; SKX-NEXT:    kunpckwd %k1, %k2, %k0
+; SKX-NEXT:    vmovups 68(%rdi), %zmm2 {%k2} {z}
+; SKX-NEXT:    vmovups 4(%rdi), %zmm3 {%k1} {z}
+; SKX-NEXT:    vcmpltps %zmm3, %zmm0, %k1
+; SKX-NEXT:    vcmpltps %zmm2, %zmm1, %k2
+; SKX-NEXT:    kunpckwd %k1, %k2, %k1
+; SKX-NEXT:    kord %k1, %k0, %k0
+; SKX-NEXT:    ktestd %k0, %k0
+; SKX-NEXT:    je LBB42_2
+; SKX-NEXT:  ## BB#1: ## %L1
+; SKX-NEXT:    vmovaps %zmm0, (%rdi)
+; SKX-NEXT:    vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+; SKX-NEXT:  LBB42_2: ## %L2
+; SKX-NEXT:    vmovaps %zmm0, 4(%rdi)
+; SKX-NEXT:    vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %addr1 = getelementptr float, float * %base, i64 0
+  %addr2 = getelementptr float, float * %base, i64 1
+
+  %vaddr1 = bitcast float* %addr1 to <32 x float>*
+  %vaddr2 = bitcast float* %addr2 to <32 x float>*
+
+  %val1 = load <32 x float>, <32 x float> *%vaddr1, align 1
+  %val2 = load <32 x float>, <32 x float> *%vaddr2, align 1
+
+  %sel1 = fcmp ogt <32 x float>%in, %val1
+  %val3 = select <32 x i1> %sel1, <32 x float> %val2, <32 x float> zeroinitializer
+  %sel2 = fcmp olt <32 x float> %in, %val3
+  %sel3 = or <32 x i1> %sel1, %sel2
+
+  %int_sel3 = bitcast <32 x i1> %sel3 to i32
+  %res = icmp eq i32 %int_sel3, zeroinitializer
+  br i1 %res, label %L2, label %L1
+L1:
+  store <32 x float> %in, <32 x float>* %vaddr1
+  br label %End
+L2:
+  store <32 x float> %in, <32 x float>* %vaddr2
+  br label %End
+End:
+  ret void
+}
+
+define <8 x i64> @load_8i1(<8 x i1>* %a) {
+; CHECK-LABEL: load_8i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2q %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <8 x i1>, <8 x i1>* %a
+  %c = sext <8 x i1> %b to <8 x i64>
+  ret <8 x i64> %c
+}
+
+define <16 x i32> @load_16i1(<16 x i1>* %a) {
+; CHECK-LABEL: load_16i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovw (%rdi), %k0
+; CHECK-NEXT:    vpmovm2d %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw (%rdi), %k0
+; SKX-NEXT:    vpmovm2d %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <16 x i1>, <16 x i1>* %a
+  %c = sext <16 x i1> %b to <16 x i32>
+  ret <16 x i32> %c
+}
+
+define <2 x i16> @load_2i1(<2 x i1>* %a) {
+; CHECK-LABEL: load_2i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    vpmovm2q %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_2i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2q %k0, %xmm0
+; SKX-NEXT:    retq
+  %b = load <2 x i1>, <2 x i1>* %a
+  %c = sext <2 x i1> %b to <2 x i16>
+  ret <2 x i16> %c
+}
+
+define <4 x i16> @load_4i1(<4 x i1>* %a) {
+; CHECK-LABEL: load_4i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovb (%rdi), %k0
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_4i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovb (%rdi), %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    retq
+  %b = load <4 x i1>, <4 x i1>* %a
+  %c = sext <4 x i1> %b to <4 x i16>
+  ret <4 x i16> %c
+}
+
+define <32 x i16> @load_32i1(<32 x i1>* %a) {
+; CHECK-LABEL: load_32i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd (%rdi), %k0
+; CHECK-NEXT:    vpmovm2w %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd (%rdi), %k0
+; SKX-NEXT:    vpmovm2w %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <32 x i1>, <32 x i1>* %a
+  %c = sext <32 x i1> %b to <32 x i16>
+  ret <32 x i16> %c
+}
+
+define <64 x i8> @load_64i1(<64 x i1>* %a) {
+; CHECK-LABEL: load_64i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovq (%rdi), %k0
+; CHECK-NEXT:    vpmovm2b %k0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: load_64i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovq (%rdi), %k0
+; SKX-NEXT:    vpmovm2b %k0, %zmm0
+; SKX-NEXT:    retq
+  %b = load <64 x i1>, <64 x i1>* %a
+  %c = sext <64 x i1> %b to <64 x i8>
+  ret <64 x i8> %c
+}
+
+define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
+; CHECK-LABEL: store_8i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <8 x i1> %v, <8 x i1>* %a
+  ret void
+}
+
+define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
+; CHECK-LABEL: store_8i1_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovw2m %xmm0, %k0
+; CHECK-NEXT:    kmovb %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_8i1_1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
+; SKX-NEXT:    vpmovw2m %xmm0, %k0
+; SKX-NEXT:    kmovb %k0, (%rdi)
+; SKX-NEXT:    retq
+  %v1 = trunc <8 x i16> %v to <8 x i1>
+  store <8 x i1> %v1, <8 x i1>* %a
+  ret void
+}
+
+define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
+; CHECK-LABEL: store_16i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %xmm0, %k0
+; CHECK-NEXT:    kmovw %k0, (%rdi)
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
+; SKX-NEXT:    vpmovb2m %xmm0, %k0
+; SKX-NEXT:    kmovw %k0, (%rdi)
+; SKX-NEXT:    retq
+  store <16 x i1> %v, <16 x i1>* %a
+  ret void
+}
+
+define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
+; CHECK-LABEL: store_32i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    vpmovb2m %ymm0, %k0
+; CHECK-NEXT:    kmovd %k0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+; SKX-LABEL: store_32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
+; SKX-NEXT:    vpmovb2m %ymm0, %k0
+; SKX-NEXT:    kmovd %k0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  store <32 x i1> %v, <32 x i1>* %a
+  ret void
+}
+
+; Truncates <32 x i16> to <32 x i1> and stores the mask through %a.
+; Expected lowering: vpsllw $15 on %zmm0 moves bit 0 of each word into the sign
+; bit, vpmovw2m collects the sign bits into %k0, and kmovd stores the 32-bit mask.
+define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
+; CHECK-LABEL: store_32i1_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $15, %zmm0, %zmm0
+; CHECK-NEXT:    vpmovw2m %zmm0, %k0
+; CHECK-NEXT:    kmovd %k0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %v1 = trunc <32 x i16> %v to <32 x i1>
+  store <32 x i1> %v1, <32 x i1>* %a
+  ret void
+}
+
+
+; Stores a <64 x i1> argument through %a.
+; Expected lowering: vpsllw $7 / vpmovb2m on %zmm0 to build the mask in %k0,
+; then a 64-bit mask store with kmovq and a vzeroupper before returning.
+define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
+;
+; CHECK-LABEL: store_64i1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsllw $7, %zmm0, %zmm0
+; CHECK-NEXT:    vpmovb2m %zmm0, %k0
+; CHECK-NEXT:    kmovq %k0, (%rdi)
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  store <64 x i1> %v, <64 x i1>* %a
+  ret void
+}
+
+; Compares <16 x i32> %a against zero, keeps the low 8 result lanes, bitcasts
+; them to i8, zero-extends to i32 and returns the value added to itself.
+; The CHECK lines expect the i8 -> i32 zero-extension to come from
+; kmovb %k0, %eax itself: no separate extend instruction appears.
+define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
+; CHECK-LABEL: test_bitcast_v8i1_zext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    addl %eax, %eax # sched: [1:0.25]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %v1 = icmp eq <16 x i32> %a, zeroinitializer
+   %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+   %mask1 = bitcast <8 x i1> %mask to i8
+   %val = zext i8 %mask1 to i32
+   %val1 = add i32 %val, %val
+   ret i32 %val1
+}
+
+; Compares <16 x i32> %a against zero, bitcasts the full 16-lane result to i16,
+; zero-extends to i32 and returns the value added to itself.
+; The CHECK lines expect the i16 -> i32 zero-extension to come from
+; kmovw %k0, %eax itself: no separate extend instruction appears.
+define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
+; CHECK-LABEL: test_bitcast_v16i1_zext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    addl %eax, %eax # sched: [1:0.25]
+; CHECK-NEXT:    vzeroupper # sched: [4:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+   %v1 = icmp eq <16 x i32> %a, zeroinitializer
+   %mask1 = bitcast <16 x i1> %v1 to i16
+   %val = zext i16 %mask1 to i32
+   %val1 = add i32 %val, %val
+   ret i32 %val1
+}
+
+; Bitcasts two i16 values to <16 x i1>, adds them lane-wise and returns the
+; result as i16. Addition of 1-bit lanes is XOR, so the expected mask
+; instruction is kxorw.
+define i16 @test_v16i1_add(i16 %x, i16 %y) {
+; CHECK-LABEL: test_v16i1_add:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxorw %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = bitcast i16 %y to <16 x i1>
+  %m2 = add <16 x i1> %m0,  %m1
+  %ret = bitcast <16 x i1> %m2 to i16
+  ret i16 %ret
+}
+
+; Bitcasts two i16 values to <16 x i1>, subtracts them lane-wise and returns
+; the result as i16. Subtraction of 1-bit lanes is XOR, so the expected mask
+; instruction is kxorw (same as the add case).
+define i16 @test_v16i1_sub(i16 %x, i16 %y) {
+; CHECK-LABEL: test_v16i1_sub:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxorw %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = bitcast i16 %y to <16 x i1>
+  %m2 = sub <16 x i1> %m0,  %m1
+  %ret = bitcast <16 x i1> %m2 to i16
+  ret i16 %ret
+}
+
+; Bitcasts two i16 values to <16 x i1>, multiplies them lane-wise and returns
+; the result as i16. Multiplication of 1-bit lanes is AND, so the expected
+; mask instruction is kandw.
+define i16 @test_v16i1_mul(i16 %x, i16 %y) {
+; CHECK-LABEL: test_v16i1_mul:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kandw %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i16 %x to <16 x i1>
+  %m1 = bitcast i16 %y to <16 x i1>
+  %m2 = mul <16 x i1> %m0,  %m1
+  %ret = bitcast <16 x i1> %m2 to i16
+  ret i16 %ret
+}
+
+; Bitcasts two i8 values to <8 x i1>, adds them lane-wise and returns the
+; result as i8. Addition of 1-bit lanes is XOR, so the expected mask
+; instruction is kxorb.
+define i8 @test_v8i1_add(i8 %x, i8 %y) {
+; CHECK-LABEL: test_v8i1_add:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxorb %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = bitcast i8 %y to <8 x i1>
+  %m2 = add <8 x i1> %m0,  %m1
+  %ret = bitcast <8 x i1> %m2 to i8
+  ret i8 %ret
+}
+
+; Bitcasts two i8 values to <8 x i1>, subtracts them lane-wise and returns the
+; result as i8. Subtraction of 1-bit lanes is XOR, so the expected mask
+; instruction is kxorb (same as the add case).
+define i8 @test_v8i1_sub(i8 %x, i8 %y) {
+; CHECK-LABEL: test_v8i1_sub:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kxorb %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = bitcast i8 %y to <8 x i1>
+  %m2 = sub <8 x i1> %m0,  %m1
+  %ret = bitcast <8 x i1> %m2 to i8
+  ret i8 %ret
+}
+
+; Bitcasts two i8 values to <8 x i1>, multiplies them lane-wise and returns
+; the result as i8. Multiplication of 1-bit lanes is AND, so the expected
+; mask instruction is kandb.
+define i8 @test_v8i1_mul(i8 %x, i8 %y) {
+; CHECK-LABEL: test_v8i1_mul:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    kandb %k1, %k0, %k0
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %m0 = bitcast i8 %x to <8 x i1>
+  %m1 = bitcast i8 %y to <8 x i1>
+  %m2 = mul <8 x i1> %m0,  %m1
+  %ret = bitcast <8 x i1> %m2 to i8
+  ret i8 %ret
+}
+
+; Splats scalar i32 %a into all 16 lanes (insertelement + all-zero shuffle
+; mask). Expected: a single vpbroadcastd straight from the GPR %edi.
+define   <16 x i32> @_inreg16xi32(i32 %a) {
+; CHECK-LABEL: _inreg16xi32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = insertelement <16 x i32> undef, i32 %a, i32 0
+  %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
+  ret <16 x i32> %c
+}
+
+; Splats scalar i64 %a into all 8 lanes (insertelement + all-zero shuffle
+; mask). Expected: a single vpbroadcastq straight from the GPR %rdi.
+define   <8 x i64> @_inreg8xi64(i64 %a) {
+; CHECK-LABEL: _inreg8xi64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = insertelement <8 x i64> undef, i64 %a, i32 0
+  %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
+  ret <8 x i64> %c
+}
+
+; Splats element 0 of a <4 x float> argument into a <16 x float> result.
+; Expected: a single register-to-register vbroadcastss.
+define   <16 x float> @_ss16xfloat_v4(<4 x float> %a) {
+; CHECK-LABEL: _ss16xfloat_v4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %b
+}
+
+; Splats scalar float %a into all 16 lanes (insertelement + all-zero shuffle
+; mask). Expected: a single vbroadcastss from %xmm0.
+define   <16 x float> @_inreg16xfloat(float %a) {
+; CHECK-LABEL: _inreg16xfloat:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+; Merge-masked splat: lanes where %mask1 != 0 receive the splat of %a, all
+; other lanes keep the corresponding lane of %i.
+; Expected: the compare against zero produces %k1, vbroadcastss writes into
+; %zmm1 (which holds %i) under {%k1}, and the result is copied to %zmm0.
+define   <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) {
+; CHECK-LABEL: _ss16xfloat_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+; Zero-masked splat: lanes where %mask1 != 0 receive the splat of %a, all
+; other lanes are zero.
+; Expected: the select folds into vbroadcastss with {%k1} {z}.
+define   <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) {
+; CHECK-LABEL: _ss16xfloat_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
+; Loads a float from %a.ptr and splats it into all 16 lanes.
+; Expected: the load folds into vbroadcastss as a memory operand.
+define   <16 x float> @_ss16xfloat_load(float* %a.ptr) {
+; CHECK-LABEL: _ss16xfloat_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load float, float* %a.ptr
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+; Merge-masked splat of a loaded float: lanes where %mask1 != 0 receive the
+; value loaded from %a.ptr, all other lanes keep %i.
+; Expected: load and select both fold into vbroadcastss (%rdi) under {%k1},
+; writing directly into %zmm0 (which holds %i).
+define   <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) {
+; CHECK-LABEL: _ss16xfloat_mask_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm2, %zmm1, %k1
+; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load float, float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> %i
+  ret <16 x float> %r
+}
+
+; Zero-masked splat of a loaded float: lanes where %mask1 != 0 receive the
+; value loaded from %a.ptr, all other lanes are zero.
+; Expected: load and select both fold into vbroadcastss (%rdi) {%k1} {z}.
+define   <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) {
+; CHECK-LABEL: _ss16xfloat_maskz_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vbroadcastss (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load float, float* %a.ptr
+  %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  %r = select <16 x i1> %mask, <16 x float> %c, <16 x float> zeroinitializer
+  ret <16 x float> %r
+}
+
+; Splats scalar double %a into all 8 lanes (insertelement + all-zero shuffle
+; mask). Expected: a single vbroadcastsd from %xmm0.
+define   <8 x double> @_inreg8xdouble(double %a) {
+; CHECK-LABEL: _inreg8xdouble:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}
+
+; Merge-masked splat of double %a: lanes where %mask1 != 0 receive the splat,
+; all other lanes keep %i. The mask source is <8 x i32>, so the compare is
+; expected on %ymm registers while the broadcast targets %zmm1 (holding %i),
+; followed by a copy to %zmm0.
+define   <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm3, %ymm2, %k1
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+; Zero-masked splat of double %a: lanes where %mask1 != 0 receive the splat,
+; all other lanes are zero.
+; Expected: the select folds into vbroadcastsd with {%k1} {z}.
+define   <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+; Loads a double from %a.ptr and splats it into all 8 lanes.
+; Expected: the load folds into vbroadcastsd as a memory operand.
+define   <8 x double> @_sd8xdouble_load(double* %a.ptr) {
+; CHECK-LABEL: _sd8xdouble_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load double, double* %a.ptr
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}
+
+; Merge-masked splat of a loaded double: lanes where %mask1 != 0 receive the
+; value loaded from %a.ptr, all other lanes keep %i.
+; Expected: load and select both fold into vbroadcastsd (%rdi) under {%k1},
+; writing directly into %zmm0 (which holds %i).
+define   <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_mask_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm2, %ymm1, %k1
+; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load double, double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> %i
+  ret <8 x double> %r
+}
+
+; Zero-masked splat of a loaded double: lanes where %mask1 != 0 receive the
+; value loaded from %a.ptr, all other lanes are zero.
+; Expected: load and select both fold into vbroadcastsd (%rdi) {%k1} {z}.
+define   <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) {
+; CHECK-LABEL: _sd8xdouble_maskz_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vbroadcastsd (%rdi), %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a = load double, double* %a.ptr
+  %mask = icmp ne <8 x i32> %mask1, zeroinitializer
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  %r = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+  ret <8 x double> %r
+}
+
+; Splats lane 0 of a <16 x i32> argument across all 16 lanes.
+; Note the CHECK line expects vbroadcastss even though the element type is
+; integer.
+define   <16 x i32> @_xmm16xi32(<16 x i32> %a) {
+; CHECK-LABEL: _xmm16xi32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer
+  ret <16 x i32> %b
+}
+
+; Splats lane 0 of a <16 x float> argument across all 16 lanes.
+; Expected: a single vbroadcastss reading the low %xmm0 part of the input.
+define   <16 x float> @_xmm16xfloat(<16 x float> %a) {
+; CHECK-LABEL: _xmm16xfloat:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %b
+}
+
+; Takes no arguments. %1 is an unordered compare of an undef vector against
+; zero; the result selects sext(all-false) where %1 is true and sext(%1)
+; elsewhere.
+; Expected: vcmpunordps produces %k0, vpmovm2d expands the mask to dwords, and
+; knotw plus a zero-masked vmovdqa32 implement the select.
+define <16 x i32> @test_vbroadcast() {
+; CHECK-LABEL: test_vbroadcast:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vcmpunordps %zmm0, %zmm0, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %zmm0
+; CHECK-NEXT:    knotw %k0, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %0 = sext <16 x i1> zeroinitializer to <16 x i32>
+  %1 = fcmp uno <16 x float> undef, zeroinitializer
+  %2 = sext <16 x i1> %1 to <16 x i32>
+  %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
+  ret <16 x i32> %3
+}
+
+; We implement the set1 intrinsics with vector initializers.  Verify that the
+; IR generated will produce broadcasts at the end.
+;
+; test_set1_pd: all 8 double lanes are filled with %d by a chain of
+; insertelement instructions; expected to collapse to one vbroadcastsd.
+define <8 x double> @test_set1_pd(double %d) #2 {
+; CHECK-LABEL: test_set1_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %vecinit.i = insertelement <8 x double> undef, double %d, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %d, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %d, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %d, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %d, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %d, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %d, i32 7
+  ret <8 x double> %vecinit7.i
+}
+
+; All 8 i64 lanes are filled with %d by a chain of insertelement
+; instructions; expected to collapse to one vpbroadcastq from the GPR %rdi.
+define <8 x i64> @test_set1_epi64(i64 %d) #2 {
+; CHECK-LABEL: test_set1_epi64:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0
+  %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1
+  %vecinit2.i = insertelement <8 x i64> %vecinit1.i, i64 %d, i32 2
+  %vecinit3.i = insertelement <8 x i64> %vecinit2.i, i64 %d, i32 3
+  %vecinit4.i = insertelement <8 x i64> %vecinit3.i, i64 %d, i32 4
+  %vecinit5.i = insertelement <8 x i64> %vecinit4.i, i64 %d, i32 5
+  %vecinit6.i = insertelement <8 x i64> %vecinit5.i, i64 %d, i32 6
+  %vecinit7.i = insertelement <8 x i64> %vecinit6.i, i64 %d, i32 7
+  ret <8 x i64> %vecinit7.i
+}
+
+; All 16 float lanes are filled with %f by a chain of insertelement
+; instructions; expected to collapse to one vbroadcastss.
+define <16 x float> @test_set1_ps(float %f) #2 {
+; CHECK-LABEL: test_set1_ps:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %vecinit.i = insertelement <16 x float> undef, float %f, i32 0
+  %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1
+  %vecinit2.i = insertelement <16 x float> %vecinit1.i, float %f, i32 2
+  %vecinit3.i = insertelement <16 x float> %vecinit2.i, float %f, i32 3
+  %vecinit4.i = insertelement <16 x float> %vecinit3.i, float %f, i32 4
+  %vecinit5.i = insertelement <16 x float> %vecinit4.i, float %f, i32 5
+  %vecinit6.i = insertelement <16 x float> %vecinit5.i, float %f, i32 6
+  %vecinit7.i = insertelement <16 x float> %vecinit6.i, float %f, i32 7
+  %vecinit8.i = insertelement <16 x float> %vecinit7.i, float %f, i32 8
+  %vecinit9.i = insertelement <16 x float> %vecinit8.i, float %f, i32 9
+  %vecinit10.i = insertelement <16 x float> %vecinit9.i, float %f, i32 10
+  %vecinit11.i = insertelement <16 x float> %vecinit10.i, float %f, i32 11
+  %vecinit12.i = insertelement <16 x float> %vecinit11.i, float %f, i32 12
+  %vecinit13.i = insertelement <16 x float> %vecinit12.i, float %f, i32 13
+  %vecinit14.i = insertelement <16 x float> %vecinit13.i, float %f, i32 14
+  %vecinit15.i = insertelement <16 x float> %vecinit14.i, float %f, i32 15
+  ret <16 x float> %vecinit15.i
+}
+
+; All 16 i32 lanes are filled with %f by a chain of insertelement
+; instructions; expected to collapse to one vpbroadcastd from the GPR %edi.
+define <16 x i32> @test_set1_epi32(i32 %f) #2 {
+; CHECK-LABEL: test_set1_epi32:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0
+  %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1
+  %vecinit2.i = insertelement <16 x i32> %vecinit1.i, i32 %f, i32 2
+  %vecinit3.i = insertelement <16 x i32> %vecinit2.i, i32 %f, i32 3
+  %vecinit4.i = insertelement <16 x i32> %vecinit3.i, i32 %f, i32 4
+  %vecinit5.i = insertelement <16 x i32> %vecinit4.i, i32 %f, i32 5
+  %vecinit6.i = insertelement <16 x i32> %vecinit5.i, i32 %f, i32 6
+  %vecinit7.i = insertelement <16 x i32> %vecinit6.i, i32 %f, i32 7
+  %vecinit8.i = insertelement <16 x i32> %vecinit7.i, i32 %f, i32 8
+  %vecinit9.i = insertelement <16 x i32> %vecinit8.i, i32 %f, i32 9
+  %vecinit10.i = insertelement <16 x i32> %vecinit9.i, i32 %f, i32 10
+  %vecinit11.i = insertelement <16 x i32> %vecinit10.i, i32 %f, i32 11
+  %vecinit12.i = insertelement <16 x i32> %vecinit11.i, i32 %f, i32 12
+  %vecinit13.i = insertelement <16 x i32> %vecinit12.i, i32 %f, i32 13
+  %vecinit14.i = insertelement <16 x i32> %vecinit13.i, i32 %f, i32 14
+  %vecinit15.i = insertelement <16 x i32> %vecinit14.i, i32 %f, i32 15
+  ret <16 x i32> %vecinit15.i
+}
+
+; We implement the scalar broadcast intrinsics with vector initializers.
+; Verify that the IR generated will produce the broadcast at the end.
+;
+; Element 0 of <2 x double> %a is extracted and inserted into every lane of an
+; <8 x double>; expected to collapse to one vbroadcastsd.
+define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) {
+; CHECK-LABEL: test_mm512_broadcastsd_pd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+entry:
+  %0 = extractelement <2 x double> %a, i32 0
+  %vecinit.i = insertelement <8 x double> undef, double %0, i32 0
+  %vecinit1.i = insertelement <8 x double> %vecinit.i, double %0, i32 1
+  %vecinit2.i = insertelement <8 x double> %vecinit1.i, double %0, i32 2
+  %vecinit3.i = insertelement <8 x double> %vecinit2.i, double %0, i32 3
+  %vecinit4.i = insertelement <8 x double> %vecinit3.i, double %0, i32 4
+  %vecinit5.i = insertelement <8 x double> %vecinit4.i, double %0, i32 5
+  %vecinit6.i = insertelement <8 x double> %vecinit5.i, double %0, i32 6
+  %vecinit7.i = insertelement <8 x double> %vecinit6.i, double %0, i32 7
+  ret <8 x double> %vecinit7.i
+}
+
+; Splats lane 0 of an <8 x float> argument into a <16 x float> result.
+; Expected: a single vbroadcastss reading %xmm0.
+define <16 x float> @suff_test1(<8 x float>%a)  {
+; CHECK-LABEL: suff_test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float>%res
+}
+
+; Splats lane 0 of a <4 x double> argument into an <8 x double> result.
+; Expected: a single vbroadcastsd reading %xmm0.
+define <8 x double> @suff_test2(<4 x double>%a)  {
+; CHECK-LABEL: suff_test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double>%res
+}
+
+; Splats lane 0 of a <32 x i8> argument into a <64 x i8> result.
+; Expected: a single vpbroadcastb reading %xmm0.
+define <64 x i8> @_invec32xi8(<32 x i8>%a)  {
+; CHECK-LABEL: _invec32xi8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer
+  ret <64 x i8>%res
+}
+
+; Splats lane 0 of a <16 x i16> argument into a <32 x i16> result.
+; Expected: a single vpbroadcastw reading %xmm0.
+define <32 x i16> @_invec16xi16(<16 x i16>%a)  {
+; CHECK-LABEL: _invec16xi16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer
+  ret <32 x i16>%res
+}
+
+; Splats lane 0 of an <8 x i32> argument into a <16 x i32> result.
+; Note the CHECK line expects vbroadcastss even though the element type is
+; integer.
+define <16 x i32> @_invec8xi32(<8 x i32>%a)  {
+; CHECK-LABEL: _invec8xi32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer
+  ret <16 x i32>%res
+}
+
+; Splats lane 0 of a <4 x i64> argument into an <8 x i64> result.
+; Note the CHECK line expects vbroadcastsd even though the element type is
+; integer.
+define <8 x i64> @_invec4xi64(<4 x i64>%a)  {
+; CHECK-LABEL: _invec4xi64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer
+  ret <8 x i64>%res
+}
+
+; Computes %x + %x, passes it to the external @func_f32, then splats the same
+; value into all 16 lanes. The value is live across the call, so the CHECK
+; lines expect it to be spilled to the stack before the call and the reload to
+; be folded into vbroadcastss as a memory operand.
+; NOTE(review): the reload's sched comment prints "?" for throughput,
+; presumably because SKX scheduling data was incomplete when these checks were
+; generated; confirm when regenerating.
+declare void @func_f32(float)
+define <16 x float> @broadcast_ss_spill(float %x) {
+; CHECK-LABEL: broadcast_ss_spill:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    subq $24, %rsp # sched: [1:0.25]
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; CHECK-NEXT:    # sched: [1:1.00]
+; CHECK-NEXT:    callq func_f32
+; CHECK-NEXT:    vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:?]
+; CHECK-NEXT:    addq $24, %rsp # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a  = fadd float %x, %x
+  call void @func_f32(float %a)
+  %b = insertelement <16 x float> undef, float %a, i32 0
+  %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer
+  ret <16 x float> %c
+}
+
+; Computes %x + %x, passes it to the external @func_f64, then splats the same
+; value into all 8 lanes. The value is live across the call, so the CHECK
+; lines expect it to be spilled to the stack before the call and the reload to
+; be folded into vbroadcastsd as a memory operand.
+; NOTE(review): the reload's sched comment prints "?" for throughput,
+; presumably because SKX scheduling data was incomplete when these checks were
+; generated; confirm when regenerating.
+declare void @func_f64(double)
+define <8 x double> @broadcast_sd_spill(double %x) {
+; CHECK-LABEL: broadcast_sd_spill:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    subq $24, %rsp # sched: [1:0.25]
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.50]
+; CHECK-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00]
+; CHECK-NEXT:    # sched: [1:1.00]
+; CHECK-NEXT:    callq func_f64
+; CHECK-NEXT:    vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [5:?]
+; CHECK-NEXT:    addq $24, %rsp # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %a  = fadd double %x, %x
+  call void @func_f64(double %a)
+  %b = insertelement <8 x double> undef, double %a, i32 0
+  %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer
+  ret <8 x double> %c
+}

Propchange: llvm/trunk/test/CodeGen/X86/avx512-schedule.ll
------------------------------------------------------------------------------
    svn:executable = *

Added: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll?rev=314594&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll Sat Sep 30 07:30:23 2017
@@ -0,0 +1,10535 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK
+; This test is a collection of AVX-512 shuffle instructions, used to check their scheduling info.
+
+define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { ; Unmasked register form: a constant 16 x i16 shuffle of %vec, expected to become one vpermw
+; CHECK-LABEL: test_16xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
+; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0xD82B (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:0.50]
+; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0xD82B (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [1:0.50]
+; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0xC208 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [1:0.50]
+; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0xC208 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [1:0.50]
+; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x6BAA (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [1:0.50]
+; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x6BAA (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { ; Unmasked register form: a constant 16 x i16 shuffle of %vec, expected to become one vpermw
+; CHECK-LABEL: test_16xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
+; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x4248 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:0.50]
+; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x4248 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) { ; Unmasked memory form: the load from %vp is expected to fold into vpermw's memory operand
+; CHECK-LABEL: test_16xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
+; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x935D (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:0.50]
+; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x935D (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [1:0.50]
+; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x4A53 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [1:0.50]
+; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x4A53 (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [1:0.50]
+; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x307C (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [1:0.50]
+; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x307C (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) { ; Unmasked memory form: the load from %vp is expected to fold into vpermw's memory operand
+; CHECK-LABEL: test_16xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
+; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; select mask packs to 0x2FCE (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:0.50]
+; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; select mask packs to 0x2FCE (element 0 is bit 0)
+  ret <16 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { ; Unmasked register form: a constant 32 x i16 shuffle of %vec, expected to become one vpermw
+; CHECK-LABEL: test_32xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
+; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0x38884462 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [5:0.50]
+; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 16, i32 1, i32 3, i32 31, i32 6, i32 11, i32 23, i32 26, i32 29, i32 5, i32 21, i32 30, i32 1, i32 21, i32 27, i32 10, i32 8, i32 19, i32 14, i32 5, i32 15, i32 13, i32 18, i32 16, i32 9, i32 11, i32 26, i32 8, i32 17, i32 0, i32 23, i32 10>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0x38884462 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [5:0.50]
+; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0xA59CEC89 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [5:0.50]
+; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 8, i32 7, i32 30, i32 11, i32 9, i32 11, i32 30, i32 20, i32 19, i32 22, i32 12, i32 13, i32 20, i32 0, i32 6, i32 10, i32 7, i32 20, i32 12, i32 28, i32 18, i32 13, i32 12, i32 22, i32 13, i32 21, i32 1, i32 14, i32 8, i32 5, i32 16>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0xA59CEC89 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [5:0.50]
+; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0x59ACDD8E (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [5:0.50]
+; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 15, i32 17, i32 24, i32 28, i32 15, i32 9, i32 14, i32 25, i32 28, i32 25, i32 6, i32 31, i32 20, i32 2, i32 23, i32 31, i32 12, i32 21, i32 10, i32 6, i32 22, i32 0, i32 26, i32 16, i32 3, i32 3, i32 20, i32 27, i32 8, i32 31, i32 3, i32 27>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0x59ACDD8E (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; Unmasked register form: a constant 32 x i16 shuffle of %vec, expected to become one vpermw
+; CHECK-LABEL: test_32xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2) { ; Merge-masked register form: lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
+; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0x2E295062 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec) { ; Zero-masked register form: lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [5:0.50]
+; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 12, i32 2, i32 8, i32 14, i32 25, i32 27, i32 4, i32 16, i32 20, i32 11, i32 27, i32 8, i32 0, i32 1, i32 21, i32 17, i32 30, i32 30, i32 29, i32 1, i32 23, i32 22, i32 20, i32 22, i32 28, i32 20, i32 11, i32 17, i32 6, i32 18, i32 0, i32 4>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0x2E295062 (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { ; Unmasked memory form: the load from %vp is expected to fold into vpermw's memory operand
+; CHECK-LABEL: test_32xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
+; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0x555A408B (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [5:0.50]
+; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 19, i32 1, i32 5, i32 31, i32 9, i32 12, i32 17, i32 9, i32 15, i32 7, i32 1, i32 5, i32 16, i32 2, i32 12, i32 10, i32 13, i32 3, i32 29, i32 15, i32 26, i32 31, i32 10, i32 15, i32 22, i32 13, i32 9, i32 23, i32 28, i32 29, i32 20, i32 12>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0x555A408B (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 keep %vec2
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [5:0.50]
+; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2 ; select mask packs to 0xCA24BE4B (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp) { ; Zero-masked memory form: load from %vp folds into vpermw; lanes whose select bit is 0 become zero ({z})
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [5:0.50]
+; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 31, i32 20, i32 2, i32 2, i32 23, i32 1, i32 0, i32 12, i32 16, i32 14, i32 15, i32 18, i32 21, i32 13, i32 11, i32 31, i32 8, i32 24, i32 13, i32 11, i32 2, i32 27, i32 22, i32 28, i32 14, i32 21, i32 3, i32 12, i32 6, i32 1, i32 30, i32 6>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; select mask packs to 0xCA24BE4B (element 0 is bit 0)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2) { ; merge-masked word permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [5:0.50]
+; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp) { ; zero-masked word permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [5:0.50]
+; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 4, i32 6, i32 12, i32 17, i32 4, i32 31, i32 31, i32 4, i32 12, i32 21, i32 28, i32 15, i32 29, i32 10, i32 15, i32 15, i32 21, i32 6, i32 19, i32 7, i32 10, i32 30, i32 28, i32 26, i32 1, i32 4, i32 8, i32 25, i32 26, i32 18, i32 22, i32 25>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { ; unmasked word permute of a vector loaded from %vp
+; CHECK-LABEL: test_32xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27> ; second operand is undef: single-source permute (expected asm uses vpermw)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2) { ; merge-masked word permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
+; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp) { ; zero-masked word permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [5:0.50]
+; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 27, i32 1, i32 7, i32 1, i32 0, i32 27, i32 10, i32 5, i32 4, i32 20, i32 30, i32 16, i32 28, i32 16, i32 18, i32 21, i32 25, i32 24, i32 31, i32 23, i32 28, i32 6, i32 17, i32 19, i32 26, i32 15, i32 25, i32 12, i32 18, i32 27>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <32 x i16> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; unmasked dword permute of a register operand
+; CHECK-LABEL: test_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $-53, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $-53, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 2, i32 0, i32 6, i32 7, i32 2, i32 3, i32 6>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [1:0.50]
+; CHECK-NEXT:    movb $-89, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,5,1,2,6,0,0,3] sched: [1:0.50]
+; CHECK-NEXT:    movb $-89, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 5, i32 1, i32 2, i32 6, i32 0, i32 0, i32 3>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; only lane 0 takes the permuted value; all other lanes come from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,6,5,5,1,7,3,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 6, i32 5, i32 5, i32 1, i32 7, i32 3, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; only lane 0 takes the permuted value; all other lanes are zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; unmasked dword permute of a register operand
+; CHECK-LABEL: test_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $47, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $47, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 0, i32 3, i32 1, i32 0, i32 4, i32 5, i32 0>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; unmasked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 7, i32 4, i32 3, i32 5, i32 2, i32 0, i32 5>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $89, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [4,6,1,7,6,7,6,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $89, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 4, i32 6, i32 1, i32 7, i32 6, i32 7, i32 6, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [1:0.50]
+; CHECK-NEXT:    movb $98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,4,6,1,6,3,6,3] sched: [1:0.50]
+; CHECK-NEXT:    movb $98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 4, i32 6, i32 1, i32 6, i32 3, i32 6, i32 3>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; unmasked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 6, i32 0, i32 0, i32 7, i32 3, i32 7, i32 7, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; unmasked dword permute of a register operand
+; CHECK-LABEL: test_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
+; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [5:0.50]
+; CHECK-NEXT:    movw $-28063, %ax # imm = 0x9261
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 14, i32 14, i32 6, i32 1, i32 12, i32 11, i32 0, i32 7>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $14154, %ax # imm = 0x374A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 10, i32 0, i32 14, i32 15, i32 11, i32 1, i32 1, i32 5, i32 0, i32 5, i32 0, i32 15, i32 13, i32 1, i32 14, i32 3>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [5:0.50]
+; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [5:0.50]
+; CHECK-NEXT:    movw $6126, %ax # imm = 0x17EE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 10, i32 15, i32 1, i32 0, i32 5, i32 0, i32 9, i32 13, i32 2, i32 1, i32 5, i32 15, i32 2, i32 15, i32 5>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; unmasked dword permute of a register operand
+; CHECK-LABEL: test_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2) { ; merge-masked dword permute of a register operand; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
+; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec) { ; zero-masked dword permute of a register operand
+; CHECK-LABEL: test_masked_z_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [5:0.50]
+; CHECK-NEXT:    movw $-11837, %ax # imm = 0xD1C3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 4, i32 14, i32 15, i32 10, i32 2, i32 15, i32 1, i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; unmasked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6> ; second operand is undef: single-source permute (expected asm uses vpermps)
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
+; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [5:0.50]
+; CHECK-NEXT:    movw $19075, %ax # imm = 0x4A83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 1, i32 6, i32 8, i32 11, i32 2, i32 6, i32 10, i32 1, i32 7, i32 5, i32 15, i32 0, i32 6, i32 6>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp) { ; zero-masked dword permute of a vector loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $27511, %ax # imm = 0x6B77
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 5, i32 3, i32 4, i32 7, i32 15, i32 12, i32 4, i32 8, i32 11, i32 12, i32 7, i32 6, i32 12, i32 6, i32 3>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; mask bit 1 -> permuted lane, mask bit 0 -> zero
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2) { ; merge-masked dword permute of a vector loaded from %vp; %vec2 is the pass-through value
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [5:0.50]
+; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp ; no separate load is expected: the permute takes (%rdi) as its memory operand
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; mask bit 1 -> permuted lane, mask bit 0 -> lane from %vec2
+  ret <16 x i32> %res
+}
+
+; Zero-masked permute of a <16 x i32> loaded from %vp: lanes whose select bit is 0 come from
+; zeroinitializer. The i1 vector (lane 0 = bit 0) packs to 0xBD8, the value moved into %k1.
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [5:0.50]
+; CHECK-NEXT:    movw $3032, %ax # imm = 0xBD8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 7, i32 14, i32 2, i32 7, i32 10, i32 7, i32 3, i32 0, i32 11, i32 9, i32 0, i32 4, i32 12, i32 10, i32 8, i32 2>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+; Unmasked permute of a <16 x i32> loaded from %vp. The expected assembly loads the index
+; vector with vmovaps and uses vpermps with the load folded in as the (%rdi) memory operand.
+define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+  ret <16 x i32> %res
+}
+; Merge-masked permute of a <16 x i32> loaded from %vp: lanes whose select bit is 0 keep %vec2.
+; The i1 vector (lane 0 = bit 0) packs to 0x21DA, the value moved into %k1.
+define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2) {
+; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2
+  ret <16 x i32> %res
+}
+
+; Zero-masked permute of a <16 x i32> loaded from %vp: lanes whose select bit is 0 come from
+; zeroinitializer. The i1 vector (lane 0 = bit 0) packs to 0x21DA, the value moved into %k1.
+define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $8666, %ax # imm = 0x21DA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 11, i32 7, i32 10, i32 12, i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+; Unmasked permute of the <4 x i64> register argument with indices <2,0,3,1>. The expected
+; assembly is a single vpermpd with no separate index-vector load.
+define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) {
+; CHECK-LABEL: test_4xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+  ret <4 x i64> %res
+}
+; Merge-masked permute of %vec with indices <2,0,3,1>: only lane 2 (select mask packs to 4)
+; takes the shuffled value, the other lanes keep %vec2. The expected vpermq writes ymm1, which
+; is then copied to ymm0.
+define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <2,0,3,1>: only lane 2 (select mask packs to 4)
+; takes the shuffled value, the other lanes come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Merge-masked permute of %vec with indices <1,2,0,3>: lanes 1-3 (select mask packs to 14)
+; take the shuffled value, lane 0 keeps %vec2. The expected vpermq writes ymm1, which is then
+; copied to ymm0.
+define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <1,2,0,3>: lanes 1-3 (select mask packs to 14)
+; take the shuffled value, lane 0 comes from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Merge-masked permute of %vec with indices <2,2,2,1>: lanes 1 and 3 (select mask packs to 10)
+; take the shuffled value, lanes 0 and 2 keep %vec2. The expected vpermq writes ymm1, which is
+; then copied to ymm0.
+define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <2,2,2,1>: lanes 1 and 3 (select mask packs to 10)
+; take the shuffled value, lanes 0 and 2 come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Unmasked permute of the <4 x i64> register argument with indices <2,1,3,3>. The expected
+; assembly is a single vpermpd with no separate index-vector load.
+define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) {
+; CHECK-LABEL: test_4xi64_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+  ret <4 x i64> %res
+}
+; Merge-masked permute of %vec with indices <2,1,3,3>: lanes 2 and 3 (select mask packs to 12)
+; take the shuffled value, lanes 0 and 1 keep %vec2. The expected vpermq writes ymm1, which is
+; then copied to ymm0.
+define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <2,1,3,3>: lanes 2 and 3 (select mask packs to 12)
+; take the shuffled value, lanes 0 and 1 come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 3>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Unmasked permute of a <4 x i64> loaded from %vp with indices <2,1,2,0>. The expected
+; assembly is a single vpermpd that reads its source directly from memory.
+define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+  ret <4 x i64> %res
+}
+; Merge-masked permute of a <4 x i64> loaded from %vp with indices <2,1,2,0>: lanes 0 and 2
+; (select mask packs to 5) take the shuffled value, lanes 1 and 3 keep %vec2.
+define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of a <4 x i64> loaded from %vp with indices <2,1,2,0>: lanes 0 and 2
+; (select mask packs to 5) take the shuffled value, lanes 1 and 3 come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Merge-masked permute of a <4 x i64> loaded from %vp with indices <2,1,1,1>: lanes 1-3
+; (select mask packs to 14) take the shuffled value, lane 0 keeps %vec2.
+define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of a <4 x i64> loaded from %vp with indices <2,1,1,1>: lanes 1-3
+; (select mask packs to 14) take the shuffled value, lane 0 comes from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 1, i32 1, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Merge-masked permute of a <4 x i64> loaded from %vp with indices <0,1,2,0>: only lane 3
+; (select mask packs to 8) takes the shuffled value, lanes 0-2 keep %vec2.
+define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of a <4 x i64> loaded from %vp with indices <0,1,2,0>: only lane 3
+; (select mask packs to 8) takes the shuffled value, lanes 0-2 come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 0>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Unmasked permute of a <4 x i64> loaded from %vp with indices <2,0,1,3>. The expected
+; assembly is a single vpermpd that reads its source directly from memory.
+define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
+; CHECK-LABEL: test_4xi64_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+  ret <4 x i64> %res
+}
+; Merge-masked permute of a <4 x i64> loaded from %vp with indices <2,0,1,3>: only lane 1
+; (select mask packs to 2) takes the shuffled value, the other lanes keep %vec2.
+define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2) {
+; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
+  ret <4 x i64> %res
+}
+
+; Zero-masked permute of a <4 x i64> loaded from %vp with indices <2,0,1,3>: only lane 1
+; (select mask packs to 2) takes the shuffled value, the other lanes come from zeroinitializer.
+define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i64>, <4 x i64>* %vp
+  %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> <i32 2, i32 0, i32 1, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Unmasked permute of the <8 x i64> register argument with indices <0,4,7,6,5,5,1,6>. The
+; expected assembly loads the index vector with vmovaps and feeds it to vpermpd.
+define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec with indices <0,4,7,6,5,5,1,6>: lanes 0 and 1 (select mask
+; packs to 3) take the shuffled value, the rest keep %vec2. The expected vpermq uses an index
+; vector in zmm2 and writes zmm1, which is then copied to zmm0.
+define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <0,4,7,6,5,5,1,6>: lanes 0 and 1 (select mask
+; packs to 3) take the shuffled value, the rest come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 7, i32 6, i32 5, i32 5, i32 1, i32 6>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec whose indices repeat the pattern <1,0,1,1> in each 4-element
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0x86
+; (printed as the signed byte -122); unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-122, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec whose indices repeat the pattern <1,0,1,1> in each 4-element
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0x86
+; (printed as the signed byte -122); unselected lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-122, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 1, i32 1, i32 5, i32 4, i32 5, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec with indices <1,3,7,3,3,5,4,1>: lanes 0 and 4 (select mask
+; packs to 17) take the shuffled value, the rest keep %vec2. The expected vpermq uses an index
+; vector in zmm2 and writes zmm1, which is then copied to zmm0.
+define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $17, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <1,3,7,3,3,5,4,1>: lanes 0 and 4 (select mask
+; packs to 17) take the shuffled value, the rest come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,3,7,3,3,5,4,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $17, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 3, i32 3, i32 5, i32 4, i32 1>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Unmasked permute of %vec whose indices repeat the pattern <3,1,3,1> in each 4-element half.
+; The expected assembly is a single vpermpd with no separate index-vector load.
+define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec whose indices repeat the pattern <3,1,3,1> in each 4-element
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0xDD
+; (printed as the signed byte -35); unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec whose indices repeat the pattern <3,1,3,1> in each 4-element
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0xDD
+; (printed as the signed byte -35); unselected lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 1, i32 7, i32 5, i32 7, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec with indices <6,3,1,1,7,4,0,3>. The select mask packs to 0xAF
+; (printed as the signed byte -81); unselected lanes keep %vec2. The expected vpermq uses an
+; index vector in zmm2 and writes zmm1, which is then copied to zmm0.
+define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [5:0.50]
+; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <6,3,1,1,7,4,0,3>. The select mask packs to 0xAF
+; (printed as the signed byte -81); unselected lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [6,3,1,1,7,4,0,3] sched: [5:0.50]
+; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 6, i32 3, i32 1, i32 1, i32 7, i32 4, i32 0, i32 3>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec that broadcasts element 0 of each 4-element half across that
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0xBD
+; (printed as the signed byte -67); unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-67, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec that broadcasts element 0 of each 4-element half across that
+; half, so the expected vpermq needs no index-vector load. The select mask packs to 0xBD
+; (printed as the signed byte -67); unselected lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-67, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Unmasked permute of the <8 x i64> register argument with indices <5,1,4,4,5,4,2,7>. The
+; expected assembly loads the index vector with vmovaps and feeds it to vpermpd.
+define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) {
+; CHECK-LABEL: test_8xi64_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec with indices <5,1,4,4,5,4,2,7>. The select mask picks the odd
+; lanes and packs to 0xAA (printed as the signed byte -86); even lanes keep %vec2. The
+; expected vpermq uses an index vector in zmm2 and writes zmm1, which is then copied to zmm0.
+define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
+; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec with indices <5,1,4,4,5,4,2,7>. The select mask picks the odd
+; lanes and packs to 0xAA (printed as the signed byte -86); even lanes come from
+; zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [5:0.50]
+; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 4, i32 4, i32 5, i32 4, i32 2, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masked permute of %vec that broadcasts element 3 of each 4-element half across that
+; half, so the expected vpermq needs no index-vector load. Only lane 1 (select mask packs to
+; 2) takes the shuffled value; the other lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of %vec that broadcasts element 3 of each 4-element half across that
+; half, so the expected vpermq needs no index-vector load. Only lane 1 (select mask packs to
+; 2) takes the shuffled value; the other lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 7, i32 7, i32 7, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Unmasked permute of an <8 x i64> loaded from %vp with indices <5,1,6,5,7,3,7,3>. The expected
+; assembly loads the index vector with vmovaps and uses vpermpd with a (%rdi) memory operand.
+define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+  ret <8 x i64> %res
+}
+; Merge-masked permute of an <8 x i64> loaded from %vp with indices <5,1,6,5,7,3,7,3>. The
+; select mask packs to 0x94 (printed as the signed byte -108); unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
+; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of an <8 x i64> loaded from %vp with indices <5,1,6,5,7,3,7,3>. The
+; select mask packs to 0x94 (printed as the signed byte -108); unselected lanes come from
+; zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [5:0.50]
+; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 1, i32 6, i32 5, i32 7, i32 3, i32 7, i32 3>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Merge-masked permute of an <8 x i64> loaded from %vp whose indices repeat the pattern
+; <1,1,1,0> in each 4-element half, so the expected vpermq reads memory directly with no
+; index-vector load. The select mask packs to 125; unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $125, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of an <8 x i64> loaded from %vp whose indices repeat the pattern
+; <1,1,1,0> in each 4-element half, so the expected vpermq reads memory directly with no
+; index-vector load. The select mask packs to 125; unselected lanes come from zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $125, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 0, i32 5, i32 5, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Merge-masked permute of an <8 x i64> loaded from %vp with indices <0,2,1,4,1,1,5,5>. The
+; select mask packs to 0xB3 (printed as the signed byte -77); unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $-77, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+; Zero-masked permute of an <8 x i64> loaded from %vp with indices <0,2,1,4,1,1,5,5>. The
+; select mask packs to 0xB3 (printed as the signed byte -77); unselected lanes come from
+; zeroinitializer.
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,2,1,4,1,1,5,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $-77, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 2, i32 1, i32 4, i32 1, i32 1, i32 5, i32 5>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Unmasked permute of an <8 x i64> loaded from %vp whose indices repeat the pattern <1,3,1,1>
+; in each 4-element half. The expected assembly is a single vpermpd reading from memory.
+define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+  ret <8 x i64> %res
+}
+; Merge-masked permute of an <8 x i64> loaded from %vp whose indices repeat the pattern
+; <1,3,1,1> in each 4-element half, so the expected vpermq reads memory directly with no
+; index-vector load. The select mask packs to 55; unselected lanes keep %vec2.
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $55, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $55, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 1, i32 5, i32 7, i32 5, i32 5> ; indices repeat [1,3,1,1] in each 256-bit half (upper half +4)
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer ; zero-masking: lanes 0,1,2,4,5 take %shuf, the rest are 0; mask = 0x37 (movb $55)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $68, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2 ; merge-masking: lanes 2,6 take %shuf, the rest keep %vec2; mask = 0x44 (movb $68)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [5,0,7,0,3,5,0,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $68, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 5, i32 0, i32 7, i32 0, i32 3, i32 5, i32 0, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer ; zero-masking: lanes 2,6 take %shuf, the rest are 0; mask = 0x44 (movb $68)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4> ; indices repeat [3,1,0,0] in each 256-bit half (upper half +4)
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2 ; merge-masking: lanes 2,3 take %shuf, the rest keep %vec2; mask = 0x0C (movb $12)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 1, i32 0, i32 0, i32 7, i32 5, i32 4, i32 4> ; indices repeat [3,1,0,0] in each 256-bit half (upper half +4)
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer ; zero-masking: lanes 2,3 take %shuf, the rest are 0; mask = 0x0C (movb $12)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
+; CHECK-LABEL: test_8xi64_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6> ; unmasked baseline for the masked variants of this shuffle
+  ret <8 x i64> %res
+}
+define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2 ; merge-masking: lanes 1,3,5 take %shuf, the rest keep %vec2; mask = 0x2A (movb $42)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [5:0.50]
+; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 0, i32 6, i32 3, i32 7, i32 3, i32 0, i32 3, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer ; zero-masking: lanes 1,3,5 take %shuf, the rest are 0; mask = 0x2A (movb $42)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2) {
+; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5> ; indices repeat [3,0,0,1] in each 256-bit half (upper half +4)
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec2 ; merge-masking: only lane 0 takes %shuf, the rest keep %vec2; mask = 0x01 (movb $1)
+  ret <8 x i64> %res
+}
+
+define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp) {
+; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i64>, <8 x i64>* %vp
+  %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 1, i32 7, i32 4, i32 4, i32 5> ; indices repeat [3,0,0,1] in each 256-bit half (upper half +4)
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer ; zero-masking: only lane 0 takes %shuf, the rest are 0; mask = 0x01 (movb $1)
+  ret <8 x i64> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; unmasked baseline for the masked variants of this shuffle
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 0,5 take %shuf, the rest keep %vec2; mask = 0x21 (movb $33)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 3, i32 4, i32 2, i32 4, i32 1, i32 2, i32 3, i32 4> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 0,5 take %shuf, the rest are 0; mask = 0x21 (movb $33)
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [1:0.50]
+; CHECK-NEXT:    movb $-34, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 1,2,3,4,6,7 take %shuf, the rest keep %vec2; mask = 0xDE (movb $-34 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,2,1,0,6,0,5,1] sched: [1:0.50]
+; CHECK-NEXT:    movb $-34, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 2, i32 1, i32 0, i32 6, i32 0, i32 5, i32 1> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 1,2,3,4,6,7 take %shuf, the rest are 0; mask = 0xDE (movb $-34 as a signed byte)
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-18, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 1,2,3,5,6,7 take %shuf, the rest keep %vec2; mask = 0xEE (movb $-18 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [2,5,5,5,4,6,0,5] sched: [1:0.50]
+; CHECK-NEXT:    movb $-18, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 2, i32 5, i32 5, i32 5, i32 4, i32 6, i32 0, i32 5> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 1,2,3,5,6,7 take %shuf, the rest are 0; mask = 0xEE (movb $-18 as a signed byte)
+  ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) {
+; CHECK-LABEL: test_8xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; unmasked baseline for the masked variants of this shuffle
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm2, %ymm1 {%k1}
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 1,4,6 take %shuf, the rest keep %vec2; mask = 0x52 (movb $82)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 5, i32 2, i32 5, i32 5, i32 5, i32 1, i32 6> ; single-source permute of %vec
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 1,4,6 take %shuf, the rest are 0; mask = 0x52 (movb $82)
+  ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; unmasked baseline for the masked variants of this shuffle
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $61, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 0,2,3,4,5 take %shuf, the rest keep %vec2; mask = 0x3D (movb $61)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $61, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 2, i32 1, i32 6, i32 4, i32 2, i32 4, i32 0> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 0,2,3,4,5 take %shuf, the rest are 0; mask = 0x3D (movb $61)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $-124, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 2,7 take %shuf, the rest keep %vec2; mask = 0x84 (movb $-124 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [1,3,7,4,0,6,6,6] sched: [1:0.50]
+; CHECK-NEXT:    movb $-124, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 1, i32 3, i32 7, i32 4, i32 0, i32 6, i32 6, i32 6> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 2,7 take %shuf, the rest are 0; mask = 0x84 (movb $-124 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $-84, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 2,3,5,7 take %shuf, the rest keep %vec2; mask = 0xAC (movb $-84 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [4,5,1,5,6,6,2,4] sched: [1:0.50]
+; CHECK-NEXT:    movb $-84, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 1, i32 5, i32 6, i32 6, i32 2, i32 4> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 2,3,5,7 take %shuf, the rest are 0; mask = 0xAC (movb $-84 as a signed byte)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
+; CHECK-LABEL: test_8xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 # sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; unmasked baseline for the masked variants of this shuffle
+  ret <8 x float> %res
+}
+define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2) {
+; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $60, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec2 ; merge-masking: lanes 2,3,4,5 take %shuf, the rest keep %vec2; mask = 0x3C (movb $60)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [1:0.50]
+; CHECK-NEXT:    movb $60, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x float>, <8 x float>* %vp
+  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 5, i32 7, i32 0, i32 6, i32 4, i32 2, i32 3, i32 0> ; single-source permute of the loaded vector
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; zero-masking: lanes 2,3,4,5 take %shuf, the rest are 0; mask = 0x3C (movb $60)
+  ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; unmasked baseline for the masked variants of this shuffle
+  ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
+; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 0,1,2,4,6,11,12,13 take %shuf, the rest keep %vec2; mask = 0x3857
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [5:0.50]
+; CHECK-NEXT:    movw $14423, %ax # imm = 0x3857
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 7, i32 5, i32 13, i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer ; zero-masking: lanes 0,1,2,4,6,11,12,13 take %shuf, the rest are 0; mask = 0x3857
+  ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 0,1,3,4,8,9,10,13,15 take %shuf, the rest keep %vec2; mask = 0xA71B
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $-22757, %ax # imm = 0xA71B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 11, i32 10, i32 4, i32 10, i32 4, i32 5, i32 8, i32 11, i32 2, i32 0, i32 10, i32 0, i32 0, i32 3, i32 10, i32 1> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; zero-masking: lanes 0,1,3,4,8,9,10,13,15 take %shuf, the rest are 0; mask = 0xA71B
+  ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [5:0.50]
+; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 0,2,3,5,8,11,13,15 take %shuf, the rest keep %vec2; mask = 0xA92D
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [5:0.50]
+; CHECK-NEXT:    movw $-22227, %ax # imm = 0xA92D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 15, i32 6, i32 14, i32 3, i32 6, i32 5, i32 2, i32 5, i32 15, i32 11, i32 6, i32 6, i32 4, i32 8, i32 11> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; zero-masking: lanes 0,2,3,5,8,11,13,15 take %shuf, the rest are 0; mask = 0xA92D
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_16xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3> ; unmasked baseline for the masked variants of this shuffle
+  ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 2,5,7,9,10,11,12,13,14 take %shuf, the rest keep %vec2; mask = 0x7EA4
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [5:0.50]
+; CHECK-NEXT:    movw $32420, %ax # imm = 0x7EA4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 0, i32 14, i32 6, i32 6, i32 0, i32 2, i32 13, i32 8, i32 11, i32 2, i32 5, i32 13, i32 13, i32 3> ; single-source permute of %vec
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer ; zero-masking: lanes 2,5,7,9,10,11,12,13,14 take %shuf, the rest are 0; mask = 0x7EA4
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1> ; unmasked baseline for the masked variants of this shuffle
+  ret <16 x float> %res
+}
+define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1> ; single-source permute of the loaded vector
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 0,5,7,8,10 take %shuf, the rest keep %vec2; mask = 0x5A1
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [5:0.50]
+; CHECK-NEXT:    movw $1441, %ax # imm = 0x5A1
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 2, i32 1, i32 14, i32 9, i32 9, i32 7, i32 2, i32 9, i32 4, i32 12, i32 11, i32 0, i32 14, i32 0, i32 1> ; single-source permute of the loaded vector
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer ; zero-masking: lanes 0,5,7,8,10 take %shuf, the rest are 0; mask = 0x5A1
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [5:0.50]
+; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4> ; single-source permute of the loaded vector
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2 ; merge-masking: lanes 2,4,5,6,9,10,11,14,15 take %shuf, the rest keep %vec2; mask = 0xCE74
+  ret <16 x float> %res
+}
+
+; Zero-masked permute of a <16 x float> loaded from %vp: shuffled lanes
+; 2, 4, 5, 6, 9, 10, 11, 14 and 15 (k-mask 0xCE74) are kept, the rest are
+; zero. The expected output folds the load into vpermps with {%k1} {z}.
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [5:0.50]
+; CHECK-NEXT:    movw $-12684, %ax # imm = 0xCE74
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 4, i32 2, i32 3, i32 5, i32 11, i32 6, i32 4, i32 7, i32 6, i32 4, i32 14, i32 8, i32 15, i32 12, i32 9, i32 4>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Merge-masked permute of a <16 x float> loaded from %vp: shuffled lanes
+; 1, 3, 4, 5, 8, 9, 11 and 13 (k-mask 0x2B3A) are kept, the other lanes come
+; from %vec2. The expected output folds the load into vpermps {%k1}.
+define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [5:0.50]
+; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> %vec2
+  ret <16 x float> %res
+}
+
+; Zero-masked permute of a <16 x float> loaded from %vp: shuffled lanes
+; 1, 3, 4, 5, 8, 9, 11 and 13 (k-mask 0x2B3A) are kept, the rest are zero.
+; The expected output folds the load into vpermps with {%k1} {z}.
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [5:0.50]
+; CHECK-NEXT:    movw $11066, %ax # imm = 0x2B3A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 10, i32 7, i32 11, i32 6, i32 7, i32 0, i32 11, i32 0, i32 10, i32 9, i32 12, i32 4, i32 10, i32 3, i32 8, i32 5>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Unmasked single-source permute of a <16 x float> loaded from %vp.
+; The expected output loads the index vector as a constant and uses vpermps
+; with the source as a memory operand.
+define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_16xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+  ret <16 x float> %res
+}
+; Merge-masked permute of a <16 x float> loaded from %vp: shuffled lanes
+; 2, 5, 7, 8, 11, 14 and 15 (k-mask 0xC9A4) are kept, the other lanes come
+; from %vec2. The expected output folds the load into vpermps {%k1}.
+define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2) {
+; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
+; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec2
+  ret <16 x float> %res
+}
+
+; Zero-masked permute of a <16 x float> loaded from %vp: shuffled lanes
+; 2, 5, 7, 8, 11, 14 and 15 (k-mask 0xC9A4) are kept, the rest are zero.
+; The expected output folds the load into vpermps with {%k1} {z}.
+define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp) {
+; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [5:0.50]
+; CHECK-NEXT:    movw $-13916, %ax # imm = 0xC9A4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermps (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x float>, <16 x float>* %vp
+  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 15, i32 15, i32 3, i32 9, i32 5, i32 15, i32 14, i32 9, i32 11, i32 10, i32 5, i32 14, i32 14, i32 5, i32 11, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Unmasked single-source permute of a <4 x double> register with indices
+; [2,1,3,2]; the expected output is a single immediate-form vpermpd on ymm0.
+define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+  ret <4 x double> %res
+}
+; Merge-masked permute [2,1,3,2] of a <4 x double> register: shuffled lanes
+; 2 and 3 are kept (mask immediate $12), lanes 0 and 1 come from %vec2.
+; The expected output writes the masked vpermpd into ymm1, then copies it
+; to ymm0.
+define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [2,1,3,2] of a <4 x double> register: shuffled lanes
+; 2 and 3 are kept (mask immediate $12), lanes 0 and 1 are zero.
+; The expected output is an in-place vpermpd on ymm0 with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+; Merge-masked permute [3,0,0,0] of a <4 x double> register: only shuffled
+; lane 3 is kept (mask immediate $8), lanes 0-2 come from %vec2.
+; The expected output writes the masked vpermpd into ymm1, then copies it
+; to ymm0.
+define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [3,0,0,0] of a <4 x double> register: only shuffled
+; lane 3 is kept (mask immediate $8), lanes 0-2 are zero.
+; The expected output is an in-place vpermpd on ymm0 with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+; Merge-masked permute [0,3,3,1] of a <4 x double> register: shuffled lanes
+; 1, 2 and 3 are kept (mask immediate $14), lane 0 comes from %vec2.
+; The expected output writes the masked vpermpd into ymm1, then copies it
+; to ymm0.
+define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [0,3,3,1] of a <4 x double> register: shuffled lanes
+; 1, 2 and 3 are kept (mask immediate $14), lane 0 is zero.
+; The expected output is an in-place vpermpd on ymm0 with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 3, i32 3, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+; Unmasked single-source permute of a <4 x double> register with indices
+; [3,3,3,2]; the expected output is a single immediate-form vpermpd on ymm0.
+define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) {
+; CHECK-LABEL: test_4xdouble_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+  ret <4 x double> %res
+}
+; Merge-masked permute [3,3,3,2] of a <4 x double> register: only shuffled
+; lane 2 is kept (mask immediate $4), the other lanes come from %vec2.
+; The expected output writes the masked vpermpd into ymm1, then copies it
+; to ymm0.
+define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [3,3,3,2] of a <4 x double> register: only shuffled
+; lane 2 is kept (mask immediate $4), the other lanes are zero.
+; The expected output is an in-place vpermpd on ymm0 with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+; Unmasked permute [0,0,2,0] of a <4 x double> loaded from %vp; the expected
+; output is an immediate-form vpermpd whose source is the memory operand.
+define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  ret <4 x double> %res
+}
+; Merge-masked permute [0,0,2,0] of a <4 x double> loaded from %vp: shuffled
+; lanes 0 and 1 are kept (mask immediate $3), lanes 2 and 3 come from %vec2.
+; The expected output is a vpermpd from memory into ymm0 under {%k1}.
+define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [0,0,2,0] of a <4 x double> loaded from %vp: shuffled
+; lanes 0 and 1 are kept (mask immediate $3), lanes 2 and 3 are zero.
+; The expected output is a vpermpd from memory with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+
+; Merge-masked permute [0,2,3,2] of a <4 x double> loaded from %vp: only
+; shuffled lane 3 is kept (mask immediate $8), lanes 0-2 come from %vec2.
+; The expected output is a vpermpd from memory into ymm0 under {%k1}.
+define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [0,2,3,2] of a <4 x double> loaded from %vp: only
+; shuffled lane 3 is kept (mask immediate $8), lanes 0-2 are zero.
+; The expected output is a vpermpd from memory with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 2, i32 3, i32 2>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+
+; Merge-masked permute [3,1,1,1] of a <4 x double> loaded from %vp: shuffled
+; lanes 1 and 2 are kept (mask immediate $6), lanes 0 and 3 come from %vec2.
+; The expected output is a vpermpd from memory into ymm0 under {%k1}.
+define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [3,1,1,1] of a <4 x double> loaded from %vp: shuffled
+; lanes 1 and 2 are kept (mask immediate $6), lanes 0 and 3 are zero.
+; The expected output is a vpermpd from memory with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 1, i32 1, i32 1>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+
+; Unmasked permute [3,2,3,2] of a <4 x double> loaded from %vp; the expected
+; output is an immediate-form vpermpd whose source is the memory operand.
+define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+; CHECK-LABEL: test_4xdouble_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  ret <4 x double> %res
+}
+; Merge-masked permute [3,2,3,2] of a <4 x double> loaded from %vp: shuffled
+; lanes 0, 1 and 3 are kept (mask immediate $11), lane 2 comes from %vec2.
+; The expected output is a vpermpd from memory into ymm0 under {%k1}.
+define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2) {
+; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec2
+  ret <4 x double> %res
+}
+
+; Zero-masked permute [3,2,3,2] of a <4 x double> loaded from %vp: shuffled
+; lanes 0, 1 and 3 are kept (mask immediate $11), lane 2 is zero.
+; The expected output is a vpermpd from memory with {%k1} {z}.
+define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x double>, <4 x double>* %vp
+  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer
+  ret <4 x double> %res
+}
+
+; Unmasked single-source permute of a <8 x double> register with indices
+; [5,7,4,2,7,4,3,4]. The expected output loads the index vector as a
+; constant and uses the variable-index (register) form of vpermpd.
+define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+  ret <8 x double> %res
+}
+; Merge-masked permute [5,7,4,2,7,4,3,4] of a <8 x double> register:
+; shuffled lanes 0, 2, 3 and 7 are kept (mask immediate $-115), the other
+; lanes come from %vec2. The expected output uses the variable-index
+; vpermpd into zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
+; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [5,7,4,2,7,4,3,4] of a <8 x double> register:
+; shuffled lanes 0, 2, 3 and 7 are kept (mask immediate $-115), the rest are
+; zero. The expected output uses the variable-index vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [5:0.50]
+; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 5, i32 7, i32 4, i32 2, i32 7, i32 4, i32 3, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Merge-masked permute [3,0,0,2,7,4,4,6] of a <8 x double> register:
+; shuffled lanes 2-7 are kept (mask immediate $-4), lanes 0 and 1 come from
+; %vec2. The expected output uses the immediate form of vpermpd (no index
+; constant is loaded) into zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [3,0,0,2,7,4,4,6] of a <8 x double> register:
+; shuffled lanes 2-7 are kept (mask immediate $-4), lanes 0 and 1 are zero.
+; The expected output uses the immediate form of vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 0, i32 0, i32 2, i32 7, i32 4, i32 4, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Merge-masked permute [7,5,5,5,3,5,1,7] of a <8 x double> register:
+; shuffled lanes 0, 4 and 5 are kept (mask immediate $49), the other lanes
+; come from %vec2. The expected output uses the variable-index vpermpd into
+; zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [5:0.50]
+; CHECK-NEXT:    movb $49, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [7,5,5,5,3,5,1,7] of a <8 x double> register:
+; shuffled lanes 0, 4 and 5 are kept (mask immediate $49), the rest are
+; zero. The expected output uses the variable-index vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [7,5,5,5,3,5,1,7] sched: [5:0.50]
+; CHECK-NEXT:    movb $49, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 7, i32 5, i32 5, i32 5, i32 3, i32 5, i32 1, i32 7>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Unmasked single-source permute of a <8 x double> register with indices
+; [1,3,3,0,5,7,7,4]; the expected output is a single immediate-form vpermpd
+; on zmm0 (listed without a sched annotation).
+define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+  ret <8 x double> %res
+}
+; Merge-masked permute [1,3,3,0,5,7,7,4] of a <8 x double> register:
+; shuffled lanes 0, 1, 2, 6 and 7 are kept (mask immediate $-57), lanes 3-5
+; come from %vec2. The expected output uses the immediate form of vpermpd
+; into zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-57, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [1,3,3,0,5,7,7,4] of a <8 x double> register:
+; shuffled lanes 0, 1, 2, 6 and 7 are kept (mask immediate $-57), lanes 3-5
+; are zero. The expected output uses the immediate form of vpermpd with
+; {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-57, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Merge-masked permute [3,5,3,4,6,5,7,1] of a <8 x double> register:
+; shuffled lanes 1, 3, 6 and 7 are kept (mask immediate $-54), the other
+; lanes come from %vec2. The expected output uses the variable-index vpermpd
+; into zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $-54, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [3,5,3,4,6,5,7,1] of a <8 x double> register:
+; shuffled lanes 1, 3, 6 and 7 are kept (mask immediate $-54), the rest are
+; zero. The expected output uses the variable-index vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [3,5,3,4,6,5,7,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $-54, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 5, i32 3, i32 4, i32 6, i32 5, i32 7, i32 1>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Merge-masked permute [3,3,2,3,7,7,6,7] of a <8 x double> register:
+; shuffled lanes 0, 1, 2, 4, 6 and 7 are kept (mask immediate $-41), lanes 3
+; and 5 come from %vec2. The expected output uses the immediate form of
+; vpermpd into zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [3,3,2,3,7,7,6,7] of a <8 x double> register:
+; shuffled lanes 0, 1, 2, 4, 6 and 7 are kept (mask immediate $-41), lanes 3
+; and 5 are zero. The expected output uses the immediate form of vpermpd
+; with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Unmasked single-source permute of a <8 x double> register with indices
+; [2,7,6,4,0,0,0,2]. The expected output loads the index vector as a
+; constant and uses the variable-index (register) form of vpermpd.
+define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) {
+; CHECK-LABEL: test_8xdouble_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+  ret <8 x double> %res
+}
+; Merge-masked permute [2,7,6,4,0,0,0,2] of a <8 x double> register: every
+; shuffled lane except lane 6 is kept (mask immediate $-65); lane 6 comes
+; from %vec2. The expected output uses the variable-index vpermpd into zmm1
+; under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
+; CHECK-NEXT:    movb $-65, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [2,7,6,4,0,0,0,2] of a <8 x double> register: every
+; shuffled lane except lane 6 is kept (mask immediate $-65); lane 6 is zero.
+; The expected output uses the variable-index vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [5:0.50]
+; CHECK-NEXT:    movb $-65, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 7, i32 6, i32 4, i32 0, i32 0, i32 0, i32 2>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Merge-masked permute [3,1,3,2,7,5,7,6] of a <8 x double> register:
+; shuffled lanes 3 and 5 are kept (mask immediate $40), the other lanes come
+; from %vec2. The expected output uses the immediate form of vpermpd into
+; zmm1 under {%k1}, then copies zmm1 to zmm0.
+define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $40, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [3,1,3,2,7,5,7,6] of a <8 x double> register:
+; shuffled lanes 3 and 5 are kept (mask immediate $40), the rest are zero.
+; The expected output uses the immediate form of vpermpd with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $40, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 3, i32 1, i32 3, i32 2, i32 7, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Unmasked permute [0,3,4,0,4,2,0,1] of a <8 x double> loaded from %vp.
+; The expected output loads the index vector as a constant and uses the
+; variable-index vpermpd with the source as a memory operand.
+define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+  ret <8 x double> %res
+}
+; Merge-masked permute [0,3,4,0,4,2,0,1] of a <8 x double> loaded from %vp:
+; shuffled lanes 0, 1, 5 and 6 are kept (mask immediate $99), the other
+; lanes come from %vec2. The expected output folds the load into the
+; variable-index vpermpd writing zmm0 under {%k1}.
+define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $99, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [0,3,4,0,4,2,0,1] of a <8 x double> loaded from %vp:
+; shuffled lanes 0, 1, 5 and 6 are kept (mask immediate $99), the rest are
+; zero. The expected output folds the load into the variable-index vpermpd
+; with {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [5:0.50]
+; CHECK-NEXT:    movb $99, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 4, i32 0, i32 4, i32 2, i32 0, i32 1>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Merge-masked permute [0,2,0,3,4,6,4,7] of a <8 x double> loaded from %vp:
+; shuffled lanes 5, 6 and 7 are kept (mask immediate $-32), lanes 0-4 come
+; from %vec2. The expected output is an immediate-form vpermpd from memory
+; into zmm0 under {%k1}.
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-32, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masked permute [0,2,0,3,4,6,4,7] of a <8 x double> loaded from %vp:
+; shuffled lanes 5, 6 and 7 are kept (mask immediate $-32), lanes 0-4 are
+; zero. The expected output is an immediate-form vpermpd from memory with
+; {%k1} {z}.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-32, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 2, i32 0, i32 3, i32 4, i32 6, i32 4, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Merge-masking permute of an <8 x double> loaded from %vp with indices that
+; cross the two 4-element halves; the expected output loads the index vector
+; into zmm1 and uses the variable-index vpermpd with the memory source folded.
+; Select lanes 1,3 take the shuffle (k1 = 10 = 0x0A); the rest keep %vec2.
+define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking counterpart of test_masked_8xdouble_perm_mem_mask2: same
+; cross-half indices and select constant (k1 = 10 = 0x0A); unselected lanes
+; are zero, so the index vector and the result can share zmm0.
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [6,7,2,7,7,6,2,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 6, i32 7, i32 2, i32 7, i32 7, i32 6, i32 2, i32 5>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Unmasked permute of an <8 x double> loaded from %vp. The index pattern
+; [2,1,1,0] repeats in the upper half (+4); the expected output is a single
+; immediate-form vpermpd with the load folded into it.
+define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+  ret <8 x double> %res
+}
+; Merge-masking variant of test_8xdouble_perm_imm_mem_mask3: same shuffle of
+; the value loaded from %vp, with select lanes 0,1,2,4,5,6 taking the shuffle
+; (k1 = 119 = 0x77) and lanes 3,7 keeping %vec2.
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $119, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking variant of test_8xdouble_perm_imm_mem_mask3: same shuffle and
+; select constant as the merge-masking test (k1 = 119 = 0x77), but lanes 3,7
+; are zeroed rather than taken from a pass-through vector.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $119, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 0, i32 6, i32 5, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Merge-masking permute of an <8 x double> loaded from %vp with cross-half
+; indices; the expected output loads the index vector into zmm1 and uses the
+; variable-index vpermpd with the memory source folded.
+; Select lanes 0,1,4,6,7 take the shuffle (k1 = -45 = 0xD3); 2,3,5 keep %vec2.
+define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [5:0.50]
+; CHECK-NEXT:    movb $-45, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking counterpart of test_masked_8xdouble_perm_mem_mask4: same
+; cross-half indices and select constant (k1 = -45 = 0xD3); lanes 2,3,5 are
+; zeroed.
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [1,1,3,5,6,0,6,0] sched: [5:0.50]
+; CHECK-NEXT:    movb $-45, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 5, i32 6, i32 0, i32 6, i32 0>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Merge-masking permute of an <8 x double> loaded from %vp. The index pattern
+; [2,2,2,3] repeats in the upper half (+4), and the expected output uses the
+; immediate form of vpermpd with a folded memory operand.
+; Select lanes 0,5 take the shuffle (k1 = 33 = 0x21); the rest keep %vec2.
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking counterpart of test_masked_8xdouble_perm_imm_mem_mask5: same
+; per-half index pattern and select constant (k1 = 33 = 0x21); unselected
+; lanes are zeroed.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $33, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 3, i32 6, i32 6, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Unmasked permute of an <8 x double> loaded from %vp with indices that cross
+; the two 4-element halves. The expected output loads the index vector (with
+; vmovaps here, unlike the vmovapd seen in the masked variants) and uses the
+; variable-index vpermpd with the memory source folded.
+define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
+; CHECK-LABEL: test_8xdouble_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+  ret <8 x double> %res
+}
+; Merge-masking variant of test_8xdouble_perm_mem_mask6: same cross-half
+; shuffle of the value loaded from %vp, with select lanes 0,2,4,5,7 taking the
+; shuffle (k1 = -75 = 0xB5) and lanes 1,3,6 keeping %vec2.
+define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $-75, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking variant of test_8xdouble_perm_mem_mask6: same cross-half
+; shuffle and the same select constant as the merge-masking test
+; (k1 = -75 = 0xB5); lanes 1,3,6 are zeroed.
+define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [5:0.50]
+; CHECK-NEXT:    movb $-75, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 2, i32 4, i32 0, i32 4, i32 6, i32 1, i32 2, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Merge-masking permute of an <8 x double> loaded from %vp. The index pattern
+; [0,3,2,0] repeats in the upper half (+4), and the expected output uses the
+; immediate form of vpermpd with a folded memory operand.
+; Select lanes 2,4,6 take the shuffle (k1 = 84 = 0x54); the rest keep %vec2.
+define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2) {
+; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $84, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec2
+  ret <8 x double> %res
+}
+
+; Zero-masking counterpart of test_masked_8xdouble_perm_imm_mem_mask7: same
+; per-half index pattern and select constant (k1 = 84 = 0x54); unselected
+; lanes are zeroed.
+define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp) {
+; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $84, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x double>, <8 x double>* %vp
+  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Unmasked single-source byte shuffle of a <16 x i8> register argument; the
+; expected output is one vpshufb on xmm0.
+define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) {
+; CHECK-LABEL: test_16xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  ret <16 x i8> %res
+}
+; Merge-masking variant of test_16xi8_perm_mask0: the same byte shuffle of
+; %vec is blended with %vec2 under a constant select (k1 = 0xD82B, lane 0 is
+; bit 0). The expected output shuffles into xmm1 under k1 and then copies
+; xmm1 to xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking variant of test_16xi8_perm_mask0: same byte shuffle and the
+; same select constant as the merge-masking test (k1 = 0xD82B); unselected
+; bytes are zeroed, so the result is written to xmm0 directly.
+define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-10197, %ax # imm = 0xD82B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 8, i32 6, i32 12, i32 4, i32 7, i32 9, i32 14, i32 8, i32 4, i32 12, i32 9, i32 4, i32 14, i32 15, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+; Merge-masking byte shuffle of a <16 x i8> register argument, blended with
+; %vec2 under a constant select (k1 = 0xC208). The expected output shuffles
+; into xmm1 under k1 and then copies xmm1 to xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking counterpart of test_masked_16xi8_perm_mask1: same shuffle and
+; select constant (k1 = 0xC208); unselected bytes are zeroed and the result is
+; written to xmm0 directly.
+define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-15864, %ax # imm = 0xC208
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 4, i32 11, i32 14, i32 10, i32 7, i32 1, i32 6, i32 9, i32 14, i32 15, i32 7, i32 13, i32 4, i32 12, i32 8, i32 0>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+; Merge-masking byte shuffle of a <16 x i8> register argument, blended with
+; %vec2 under a constant select (k1 = 0x6BAA). The expected output shuffles
+; into xmm1 under k1 and then copies xmm1 to xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking counterpart of test_masked_16xi8_perm_mask2: same shuffle and
+; select constant (k1 = 0x6BAA); unselected bytes are zeroed and the result is
+; written to xmm0 directly.
+define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27562, %ax # imm = 0x6BAA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 11, i32 6, i32 13, i32 10, i32 0, i32 7, i32 13, i32 3, i32 5, i32 13, i32 3, i32 9, i32 3, i32 15, i32 12, i32 7>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+; Unmasked single-source byte shuffle of a <16 x i8> register argument; the
+; expected output is one vpshufb on xmm0.
+define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) {
+; CHECK-LABEL: test_16xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  ret <16 x i8> %res
+}
+; Merge-masking variant of test_16xi8_perm_mask3: the same byte shuffle of
+; %vec is blended with %vec2 under a constant select (k1 = 0x4248). The
+; expected output shuffles into xmm1 under k1 and then copies xmm1 to xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking variant of test_16xi8_perm_mask3: same byte shuffle and the
+; same select constant as the merge-masking test (k1 = 0x4248); unselected
+; bytes are zeroed and the result is written to xmm0 directly.
+define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $16968, %ax # imm = 0x4248
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 5, i32 8, i32 14, i32 1, i32 8, i32 11, i32 8, i32 13, i32 8, i32 15, i32 9, i32 9, i32 7, i32 9, i32 6>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+; Unmasked byte shuffle of a <16 x i8> loaded from %vp. In the expected
+; output the load is a separate vmovdqa into xmm0, followed by a
+; register-source vpshufb.
+define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
+; CHECK-LABEL: test_16xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  ret <16 x i8> %res
+}
+; Merge-masking variant of test_16xi8_perm_mem_mask0: the value loaded from
+; %vp is shuffled and blended with %vec2 under a constant select
+; (k1 = 0x935D). The expected output loads into xmm1 and writes the masked
+; vpshufb result straight into xmm0, with no trailing register copy.
+define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking variant of test_16xi8_perm_mem_mask0: same shuffle of the value
+; loaded from %vp and the same select constant as the merge-masking test
+; (k1 = 0x935D); unselected bytes are zeroed.
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    movw $-27811, %ax # imm = 0x935D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 10, i32 7, i32 1, i32 12, i32 14, i32 14, i32 13, i32 14, i32 14, i32 8, i32 6, i32 11, i32 4, i32 12, i32 13>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+; Merge-masking byte shuffle of a <16 x i8> loaded from %vp, blended with
+; %vec2 under a constant select (k1 = 0x4A53). The expected output loads into
+; xmm1 and writes the masked vpshufb result straight into xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking counterpart of test_masked_16xi8_perm_mem_mask1: same shuffle
+; of the value loaded from %vp and the same select constant (k1 = 0x4A53);
+; unselected bytes are zeroed.
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    movw $19027, %ax # imm = 0x4A53
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 14, i32 9, i32 15, i32 9, i32 7, i32 10, i32 15, i32 14, i32 12, i32 1, i32 9, i32 7, i32 10, i32 13, i32 3, i32 11>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+; Merge-masking byte shuffle of a <16 x i8> loaded from %vp, blended with
+; %vec2 under a constant select (k1 = 0x307C). The expected output loads into
+; xmm1 and writes the masked vpshufb result straight into xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking counterpart of test_masked_16xi8_perm_mem_mask2: same shuffle
+; of the value loaded from %vp and the same select constant (k1 = 0x307C);
+; unselected bytes are zeroed.
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    movw $12412, %ax # imm = 0x307C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 12, i32 5, i32 13, i32 1, i32 2, i32 11, i32 0, i32 9, i32 14, i32 8, i32 10, i32 0, i32 10, i32 9>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+; Unmasked byte shuffle of a <16 x i8> loaded from %vp. In the expected
+; output the load is a separate vmovdqa into xmm0, followed by a
+; register-source vpshufb.
+define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
+; CHECK-LABEL: test_16xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  ret <16 x i8> %res
+}
+; Merge-masking variant of test_16xi8_perm_mem_mask3: the value loaded from
+; %vp is shuffled and blended with %vec2 under a constant select
+; (k1 = 0x2FCE). The expected output loads into xmm1 and writes the masked
+; vpshufb result straight into xmm0.
+define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2) {
+; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 # sched: [1:0.50]
+; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> %vec2
+  ret <16 x i8> %res
+}
+
+; Zero-masking variant of test_16xi8_perm_mem_mask3: same shuffle of the value
+; loaded from %vp and the same select constant as the merge-masking test
+; (k1 = 0x2FCE); unselected bytes are zeroed.
+define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %xmm0 # sched: [1:0.50]
+; CHECK-NEXT:    movw $12238, %ax # imm = 0x2FCE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i8>, <16 x i8>* %vp
+  %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> <i32 9, i32 6, i32 5, i32 15, i32 0, i32 0, i32 15, i32 2, i32 1, i32 3, i32 12, i32 14, i32 0, i32 6, i32 1, i32 4>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i8> %shuf, <16 x i8> zeroinitializer
+  ret <16 x i8> %res
+}
+
+; Unmasked single-source byte shuffle of a <32 x i8> register argument. Every
+; index stays inside its own 16-byte half (0-15, then 16-31), and the expected
+; output is one vpshufb on ymm0.
+define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) {
+; CHECK-LABEL: test_32xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+  ret <32 x i8> %res
+}
+; Merge-masking variant of test_32xi8_perm_mask0: the same in-half byte
+; shuffle of %vec is blended with %vec2 under a 32-bit constant select
+; (k1 = 0x38884462, lane 0 is bit 0). The expected output shuffles into ymm1
+; under k1 and then copies ymm1 to ymm0.
+define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
+  ret <32 x i8> %res
+}
+
+; Zero-masking variant of test_32xi8_perm_mask0: same in-half byte shuffle and
+; the same select constant as the merge-masking test (k1 = 0x38884462);
+; unselected bytes are zeroed and the result is written to ymm0 directly.
+define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $948454498, %eax # imm = 0x38884462
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 8, i32 0, i32 1, i32 15, i32 3, i32 5, i32 11, i32 13, i32 14, i32 2, i32 10, i32 15, i32 0, i32 10, i32 13, i32 5, i32 20, i32 25, i32 23, i32 18, i32 23, i32 22, i32 25, i32 24, i32 20, i32 21, i32 29, i32 20, i32 24, i32 16, i32 27, i32 21>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+; Merge-masking byte shuffle of a <32 x i8> register argument (indices stay
+; inside each 16-byte half), blended with %vec2 under a 32-bit constant select
+; (k1 = 0xA59CEC89). The expected output shuffles into ymm1 under k1 and then
+; copies ymm1 to ymm0.
+define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2
+  ret <32 x i8> %res
+}
+
+; Zero-masking counterpart of test_masked_32xi8_perm_mask1: same shuffle and
+; select constant (k1 = 0xA59CEC89); unselected bytes are zeroed and the
+; result is written to ymm0 directly.
+define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec) {
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1516442487, %eax # imm = 0xA59CEC89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 0, i32 4, i32 3, i32 15, i32 5, i32 4, i32 5, i32 15, i32 10, i32 9, i32 11, i32 6, i32 6, i32 10, i32 0, i32 3, i32 21, i32 19, i32 26, i32 22, i32 30, i32 25, i32 22, i32 22, i32 27, i32 22, i32 26, i32 16, i32 23, i32 20, i32 18, i32 24>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+; Merge-masking byte shuffle of a <32 x i8> register argument (indices stay
+; inside each 16-byte half), blended with %vec2 under a 32-bit constant select
+; (k1 = 0x59ACDD8E). The expected output shuffles into ymm1 under k1 and then
+; copies ymm1 to ymm0.
+define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2) {
+; CHECK-LABEL: test_masked_32xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec) { ; Byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1504501134, %eax # imm = 0x59ACDD8E
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 7, i32 8, i32 12, i32 14, i32 7, i32 4, i32 7, i32 12, i32 14, i32 12, i32 3, i32 15, i32 10, i32 1, i32 11, i32 15, i32 22, i32 26, i32 21, i32 19, i32 27, i32 16, i32 29, i32 24, i32 17, i32 17, i32 26, i32 29, i32 20, i32 31, i32 17, i32 29>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) { ; Unmasked byte shuffle of %vec; every index stays inside its own 16-byte group and the expected asm is a single vpshufb.
+; CHECK-LABEL: test_32xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+  ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2) { ; Byte shuffle of %vec blended with %vec2 under a constant mask; the expected asm is a merge-masked vpshufb followed by a register copy.
+; CHECK-LABEL: test_masked_32xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2 ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec) { ; Byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $774459490, %eax # imm = 0x2E295062
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 6, i32 1, i32 4, i32 7, i32 12, i32 13, i32 2, i32 8, i32 10, i32 5, i32 13, i32 4, i32 0, i32 0, i32 10, i32 8, i32 31, i32 31, i32 30, i32 16, i32 27, i32 27, i32 26, i32 27, i32 30, i32 26, i32 21, i32 24, i32 19, i32 25, i32 16, i32 18>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) { ; Unmasked byte shuffle of a vector loaded from %vp; the expected asm is a vmovdqa load followed by a single vpshufb.
+; CHECK-LABEL: test_32xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+  ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2) { ; Byte shuffle of a vector loaded from %vp, blended with %vec2 under a constant mask; the expected asm is a load followed by a merge-masked vpshufb.
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2 ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp) { ; Byte shuffle of a vector loaded from %vp with unselected bytes zeroed under a constant mask; the expected asm is a load followed by a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    movl $1431978123, %eax # imm = 0x555A408B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 9, i32 0, i32 2, i32 15, i32 4, i32 6, i32 8, i32 4, i32 7, i32 3, i32 0, i32 2, i32 8, i32 1, i32 6, i32 5, i32 22, i32 17, i32 30, i32 23, i32 29, i32 31, i32 21, i32 23, i32 27, i32 22, i32 20, i32 27, i32 30, i32 30, i32 26, i32 22>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2) { ; Byte shuffle of a vector loaded from %vp, blended with %vec2 under a constant mask; the expected asm is a load followed by a merge-masked vpshufb.
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2 ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp) { ; Byte shuffle of a vector loaded from %vp with unselected bytes zeroed under a constant mask; the expected asm is a load followed by a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    movl $-903561653, %eax # imm = 0xCA24BE4B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 15, i32 10, i32 1, i32 1, i32 11, i32 0, i32 0, i32 6, i32 8, i32 7, i32 7, i32 9, i32 10, i32 6, i32 5, i32 15, i32 20, i32 28, i32 22, i32 21, i32 17, i32 29, i32 27, i32 30, i32 23, i32 26, i32 17, i32 22, i32 19, i32 16, i32 31, i32 19>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2) { ; Byte shuffle of a vector loaded from %vp, blended with %vec2 under a constant mask; the expected asm is a load followed by a merge-masked vpshufb.
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> %vec2 ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp) { ; Byte shuffle of a vector loaded from %vp with unselected bytes zeroed under a constant mask; the expected asm is a load followed by a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    movl $-1209035774, %eax # imm = 0xB7EF9402
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 2, i32 3, i32 6, i32 8, i32 2, i32 15, i32 15, i32 2, i32 6, i32 10, i32 14, i32 7, i32 14, i32 5, i32 7, i32 7, i32 26, i32 19, i32 25, i32 19, i32 21, i32 31, i32 30, i32 29, i32 16, i32 18, i32 20, i32 28, i32 29, i32 25, i32 27, i32 28>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) { ; Unmasked byte shuffle of a vector loaded from %vp; the expected asm is a vmovdqa load followed by a single vpshufb.
+; CHECK-LABEL: test_32xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+  ret <32 x i8> %res
+}
+define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2) { ; Byte shuffle of a vector loaded from %vp, blended with %vec2 under a constant mask; the expected asm is a load followed by a merge-masked vpshufb.
+; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm1 # sched: [1:0.50]
+; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> %vec2 ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp) { ; Byte shuffle of a vector loaded from %vp with unselected bytes zeroed under a constant mask; the expected asm is a load followed by a zero-masked ({z}) vpshufb.
+; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0 # sched: [1:0.50]
+; CHECK-NEXT:    movl $1452798329, %eax # imm = 0x5697F179
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i8>, <32 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> <i32 1, i32 1, i32 13, i32 0, i32 3, i32 0, i32 0, i32 13, i32 5, i32 2, i32 2, i32 10, i32 15, i32 8, i32 14, i32 8, i32 25, i32 26, i32 28, i32 28, i32 31, i32 27, i32 30, i32 19, i32 24, i32 25, i32 29, i32 23, i32 28, i32 22, i32 25, i32 29>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i8> %shuf, <32 x i8> zeroinitializer ; select element i is bit i of the immediate moved into %k1 in the expected asm
+  ret <32 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { ; Unmasked 64-byte shuffle of %vec; every index stays inside its own 16-byte group and the expected asm is a single zmm vpshufb.
+; CHECK-LABEL: test_64xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2) { ; 64-byte shuffle of %vec blended with %vec2 under a constant mask; the expected asm is a merge-masked zmm vpshufb followed by a register copy.
+; CHECK-LABEL: test_masked_64xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $3680399704764602881, %rax # imm = 0x3313680829F25A01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2 ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec) { ; 64-byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) zmm vpshufb.
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $3680399704764602881, %rax # imm = 0x3313680829F25A01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 8, i32 4, i32 1, i32 13, i32 15, i32 4, i32 6, i32 12, i32 0, i32 10, i32 2, i32 4, i32 13, i32 0, i32 0, i32 6, i32 23, i32 29, i32 27, i32 26, i32 18, i32 31, i32 22, i32 25, i32 22, i32 16, i32 23, i32 18, i32 16, i32 25, i32 26, i32 17, i32 40, i32 37, i32 38, i32 44, i32 39, i32 46, i32 41, i32 39, i32 42, i32 37, i32 33, i32 42, i32 41, i32 44, i32 34, i32 46, i32 60, i32 62, i32 61, i32 58, i32 60, i32 56, i32 60, i32 51, i32 60, i32 55, i32 60, i32 55, i32 60, i32 49, i32 48, i32 62>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2) { ; 64-byte shuffle of %vec blended with %vec2 under a constant mask; the expected asm is a merge-masked zmm vpshufb followed by a register copy.
+; CHECK-LABEL: test_masked_64xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $3029806472256067585, %rax # imm = 0x2A0C08EF15009801
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2 ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec) { ; 64-byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) zmm vpshufb.
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $3029806472256067585, %rax # imm = 0x2A0C08EF15009801
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 7, i32 14, i32 15, i32 10, i32 9, i32 3, i32 1, i32 13, i32 14, i32 12, i32 11, i32 6, i32 4, i32 1, i32 6, i32 9, i32 30, i32 30, i32 22, i32 17, i32 28, i32 27, i32 16, i32 23, i32 26, i32 16, i32 30, i32 31, i32 27, i32 17, i32 17, i32 21, i32 32, i32 37, i32 32, i32 47, i32 45, i32 33, i32 46, i32 35, i32 35, i32 42, i32 47, i32 33, i32 32, i32 37, i32 32, i32 41, i32 61, i32 50, i32 49, i32 53, i32 63, i32 50, i32 63, i32 53, i32 55, i32 52, i32 62, i32 63, i32 58, i32 50, i32 63, i32 49>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2) { ; 64-byte shuffle of %vec blended with %vec2 under a constant mask; the expected asm is a merge-masked zmm vpshufb followed by a register copy.
+; CHECK-LABEL: test_masked_64xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $1110016799796225, %rax # imm = 0x3F18DED0BEC01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2 ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec) { ; 64-byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) zmm vpshufb.
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $1110016799796225, %rax # imm = 0x3F18DED0BEC01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 9, i32 2, i32 14, i32 15, i32 12, i32 5, i32 3, i32 12, i32 4, i32 6, i32 0, i32 2, i32 0, i32 1, i32 1, i32 6, i32 24, i32 27, i32 18, i32 22, i32 26, i32 17, i32 23, i32 21, i32 31, i32 16, i32 22, i32 22, i32 27, i32 21, i32 19, i32 20, i32 39, i32 47, i32 44, i32 36, i32 40, i32 43, i32 44, i32 39, i32 38, i32 44, i32 38, i32 35, i32 39, i32 46, i32 34, i32 39, i32 58, i32 55, i32 51, i32 48, i32 59, i32 57, i32 48, i32 52, i32 60, i32 58, i32 56, i32 50, i32 59, i32 55, i32 58, i32 60>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { ; Unmasked 64-byte shuffle of %vec; every index stays inside its own 16-byte group and the expected asm is a single zmm vpshufb.
+; CHECK-LABEL: test_64xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2) { ; 64-byte shuffle of %vec blended with %vec2 under a constant mask; the expected asm is a merge-masked zmm vpshufb followed by a register copy.
+; CHECK-LABEL: test_masked_64xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $839183534234450945, %rax # imm = 0xBA560FA6B66BC01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2 ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec) { ; 64-byte shuffle of %vec with unselected bytes zeroed under a constant mask; the expected asm is a zero-masked ({z}) zmm vpshufb.
+; CHECK-LABEL: test_masked_z_64xi8_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $839183534234450945, %rax # imm = 0xBA560FA6B66BC01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 3, i32 12, i32 4, i32 15, i32 1, i32 14, i32 0, i32 4, i32 8, i32 9, i32 6, i32 1, i32 4, i32 4, i32 12, i32 14, i32 25, i32 16, i32 28, i32 20, i32 21, i32 24, i32 19, i32 30, i32 18, i32 22, i32 20, i32 24, i32 25, i32 26, i32 24, i32 22, i32 42, i32 38, i32 44, i32 44, i32 36, i32 37, i32 42, i32 34, i32 43, i32 38, i32 41, i32 34, i32 42, i32 37, i32 39, i32 38, i32 55, i32 59, i32 53, i32 58, i32 48, i32 52, i32 59, i32 48, i32 57, i32 48, i32 55, i32 62, i32 48, i32 56, i32 49, i32 61>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { ; Unmasked 64-byte shuffle of a vector loaded from %vp; the expected asm is a vmovdqa64 load followed by a single zmm vpshufb.
+; CHECK-LABEL: test_64xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2) { ; 64-byte shuffle of a vector loaded from %vp, blended with %vec2 under a constant mask; the expected asm is a load followed by a merge-masked zmm vpshufb.
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3164984076108002305, %rax # imm = 0x2BEC483F982F7401
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp ; shuffle source comes from memory rather than a register argument
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2 ; select element i is bit i of the 64-bit immediate moved into %k1 in the expected asm
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp) { ; Zero-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate 0x2BEC483F982F7401).
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3164984076108002305, %rax # imm = 0x2BEC483F982F7401
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 0, i32 9, i32 15, i32 13, i32 11, i32 11, i32 3, i32 12, i32 4, i32 1, i32 7, i32 5, i32 2, i32 6, i32 14, i32 6, i32 23, i32 27, i32 24, i32 18, i32 30, i32 23, i32 28, i32 22, i32 28, i32 22, i32 19, i32 19, i32 31, i32 25, i32 16, i32 22, i32 35, i32 33, i32 34, i32 32, i32 42, i32 34, i32 41, i32 41, i32 43, i32 40, i32 36, i32 46, i32 37, i32 39, i32 42, i32 40, i32 63, i32 63, i32 62, i32 62, i32 57, i32 55, i32 59, i32 51, i32 52, i32 48, i32 50, i32 48, i32 58, i32 50, i32 60, i32 58>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2) { ; Merge-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate 0x2F7C2C07659EAA01).
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3421658227176024577, %rax # imm = 0x2F7C2C07659EAA01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp) { ; Zero-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate 0x2F7C2C07659EAA01).
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3421658227176024577, %rax # imm = 0x2F7C2C07659EAA01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 15, i32 6, i32 14, i32 7, i32 5, i32 1, i32 14, i32 12, i32 5, i32 7, i32 5, i32 0, i32 0, i32 5, i32 3, i32 8, i32 19, i32 19, i32 26, i32 27, i32 20, i32 29, i32 20, i32 21, i32 27, i32 16, i32 30, i32 17, i32 23, i32 27, i32 16, i32 28, i32 47, i32 39, i32 33, i32 33, i32 33, i32 44, i32 38, i32 46, i32 39, i32 33, i32 38, i32 44, i32 45, i32 32, i32 34, i32 39, i32 50, i32 61, i32 62, i32 53, i32 54, i32 56, i32 52, i32 56, i32 51, i32 52, i32 55, i32 57, i32 56, i32 52, i32 51, i32 49>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2) { ; Merge-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate 0x2AD1052B29324A01).
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3085252902658394625, %rax # imm = 0x2AD1052B29324A01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp) { ; Zero-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate 0x2AD1052B29324A01).
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $3085252902658394625, %rax # imm = 0x2AD1052B29324A01
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 12, i32 1, i32 11, i32 3, i32 4, i32 11, i32 10, i32 11, i32 8, i32 13, i32 1, i32 10, i32 1, i32 11, i32 5, i32 10, i32 27, i32 26, i32 19, i32 29, i32 19, i32 24, i32 26, i32 19, i32 26, i32 20, i32 18, i32 28, i32 24, i32 21, i32 25, i32 16, i32 34, i32 38, i32 47, i32 40, i32 33, i32 44, i32 44, i32 44, i32 41, i32 43, i32 35, i32 43, i32 45, i32 44, i32 37, i32 41, i32 58, i32 62, i32 49, i32 61, i32 56, i32 53, i32 55, i32 48, i32 51, i32 58, i32 58, i32 55, i32 63, i32 55, i32 53, i32 61>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { ; Unmasked byte shuffle of the vector loaded from %vp; expected asm is a vmovdqa64 load followed by one vpshufb.
+; CHECK-LABEL: test_64xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+  ret <64 x i8> %res
+}
+define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2) { ; Merge-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate 0x693DEAE3E5E201).
+; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm1 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $29622951609754113, %rax # imm = 0x693DEAE3E5E201
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> %vec2
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp) { ; Zero-masked byte shuffle of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate 0x693DEAE3E5E201).
+; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovdqa64 (%rdi), %zmm0 # sched: [5:0.50]
+; CHECK-NEXT:    movabsq $29622951609754113, %rax # imm = 0x693DEAE3E5E201
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovq %rax, %k1
+; CHECK-NEXT:    vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <64 x i8>, <64 x i8>* %vp
+  %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> <i32 4, i32 9, i32 11, i32 13, i32 12, i32 6, i32 0, i32 0, i32 11, i32 15, i32 5, i32 7, i32 11, i32 10, i32 4, i32 10, i32 20, i32 21, i32 24, i32 27, i32 18, i32 16, i32 26, i32 16, i32 16, i32 19, i32 26, i32 17, i32 16, i32 31, i32 22, i32 30, i32 35, i32 38, i32 37, i32 34, i32 37, i32 47, i32 43, i32 38, i32 38, i32 36, i32 40, i32 43, i32 42, i32 39, i32 32, i32 46, i32 54, i32 54, i32 48, i32 50, i32 61, i32 56, i32 59, i32 50, i32 53, i32 61, i32 61, i32 51, i32 48, i32 60, i32 50, i32 60>
+  %res = select <64 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <64 x i8> %shuf, <64 x i8> zeroinitializer
+  ret <64 x i8> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) { ; Unmasked shuffle that permutes only words 4-7 of %vec ([6,5,7,6]); expected asm is a single vpshufhw.
+; CHECK-LABEL: test_8xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [6,5,7,6] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate -82).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec) { ; Zero-masked high-word shuffle [6,5,7,6] of %vec; lanes with a 0 select bit become zero (k1 immediate -82).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [0,3,0,0] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate 43).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $43, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec) { ; Zero-masked low-word shuffle [0,3,0,0] of %vec; lanes with a 0 select bit become zero (k1 immediate 43).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $43, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [5,4,4,5] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate 20).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $20, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec) { ; Zero-masked high-word shuffle [5,4,4,5] of %vec; lanes with a 0 select bit become zero (k1 immediate 20).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $20, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 4, i32 5>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) { ; Unmasked shuffle that permutes only words 0-3 of %vec ([2,1,1,1]); expected asm is a single vpshuflw.
+; CHECK-LABEL: test_8xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [2,1,1,1] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate -20).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-20, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec) { ; Zero-masked low-word shuffle [2,1,1,1] of %vec; lanes with a 0 select bit become zero (k1 immediate -20).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-20, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [5,5,7,6] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate -104).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec) { ; Zero-masked high-word shuffle [5,5,7,6] of %vec; lanes with a 0 select bit become zero (k1 immediate -104).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 7, i32 6>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [3,3,2,1] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate -98).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec) { ; Zero-masked low-word shuffle [3,3,2,1] of %vec; lanes with a 0 select bit become zero (k1 immediate -98).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 2, i32 1, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) { ; Unmasked shuffle that permutes only words 4-7 of %vec ([6,5,6,5]); expected asm is a single vpshufhw.
+; CHECK-LABEL: test_8xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [6,5,6,5] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate 117).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $117, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec) { ; Zero-masked high-word shuffle [6,5,6,5] of %vec; lanes with a 0 select bit become zero (k1 immediate 117).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $117, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [1,0,2,0] of %vec; lanes with a 0 select bit keep %vec2 (k1 immediate 39).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $39, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec) { ; Zero-masked low-word shuffle [1,0,2,0] of %vec; lanes with a 0 select bit become zero (k1 immediate 39).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $39, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) { ; Unmasked high-word shuffle [7,7,4,6] of the vector loaded from %vp; expected asm is one vpshufhw with a memory source operand.
+; CHECK-LABEL: test_8xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [7,7,4,6] of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate -83).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-83, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) { ; Zero-masked high-word shuffle [7,7,4,6] of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate -83).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-83, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 4, i32 6>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [1,3,3,2] of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate -108).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp) { ; Zero-masked low-word shuffle [1,3,3,2] of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate -108).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-108, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [6,6,5,7] of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate -58).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp) { ; Zero-masked high-word shuffle [6,6,5,7] of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate -58).
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-58, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 6, i32 5, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) { ; Unmasked low-word shuffle [3,1,2,0] of the vector loaded from %vp; expected asm is one vpshuflw with a memory source operand.
+; CHECK-LABEL: test_8xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2) { ; Merge-masked low-word shuffle [3,1,2,0] of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate 74).
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $74, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) { ; Zero-masked low-word shuffle [3,1,2,0] of the vector loaded from %vp; lanes with a 0 select bit become zero (k1 immediate 74).
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $74, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2) { ; Merge-masked high-word shuffle [7,6,7,5] of the vector loaded from %vp; lanes with a 0 select bit keep %vec2 (k1 immediate -81).
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp) { ; zero-masked vpshufhw test: <8 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-81, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 7, i32 5> ; permutes words 4-7; words 0-3 stay in place
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xAF = -81 as a signed byte)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2) { ; merge-masked vpshuflw test: <8 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $53, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ; permutes words 0-3; words 4-7 stay in place
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x35 = 53)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp) { ; zero-masked vpshuflw test: <8 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $53, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 2, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> ; permutes words 0-3; words 4-7 stay in place
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x35 = 53)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) { ; unmasked vpshufhw test: <8 x i16> source loaded from %vp
+; CHECK-LABEL: test_8xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4> ; permutes words 4-7; words 0-3 stay in place
+  ret <8 x i16> %res
+}
+define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2) { ; merge-masked vpshufhw test: <8 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-121, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4> ; permutes words 4-7; words 0-3 stay in place
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x87 = -121 as a signed byte)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) { ; zero-masked vpshufhw test: <8 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-121, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 4, i32 4> ; permutes words 4-7; words 0-3 stay in place
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i16> %shuf, <8 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x87 = -121 as a signed byte)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2) { ; merge-masked vpshuflw test: <8 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $87, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7> ; permutes words 0-3; words 4-7 stay in place
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x57 = 87)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp) { ; zero-masked vpshuflw test: <8 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $87, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i16>, <8 x i16>* %vp
+  %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 1, i32 4, i32 5, i32 6, i32 7> ; permutes words 0-3; words 4-7 stay in place
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x i16> %shuf, <8 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x57 = 87)
+  ret <8 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) { ; unmasked vpshufhw test on a <16 x i16> register operand
+; CHECK-LABEL: test_16xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshufhw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3495, %ax # imm = 0xF259
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xF259)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec) { ; zero-masked vpshufhw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3495, %ax # imm = 0xF259
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 14, i32 12> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xF259)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshuflw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-11903, %ax # imm = 0xD181
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xD181)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec) { ; zero-masked vpshuflw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-11903, %ax # imm = 0xD181
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 8, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xD181)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshufhw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-14510, %ax # imm = 0xC752
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xC752)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec) { ; zero-masked vpshufhw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-14510, %ax # imm = 0xC752
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 5, i32 5, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 13, i32 13, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xC752)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) { ; unmasked vpshuflw test on a <16 x i16> register operand
+; CHECK-LABEL: test_16xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshuflw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-16563, %ax # imm = 0xBF4D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xBF4D)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec) { ; zero-masked vpshuflw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-16563, %ax # imm = 0xBF4D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xBF4D)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshufhw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $12298, %ax # imm = 0x300A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x300A)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec) { ; zero-masked vpshufhw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $12298, %ax # imm = 0x300A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 12, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x300A)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshuflw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-29565, %ax # imm = 0x8C83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x8C83)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec) { ; zero-masked vpshuflw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-29565, %ax # imm = 0x8C83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x8C83)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) { ; unmasked vpshufhw test on a <16 x i16> register operand
+; CHECK-LABEL: test_16xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshufhw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27779, %ax # imm = 0x6C83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x6C83)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec) { ; zero-masked vpshufhw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27779, %ax # imm = 0x6C83
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 7, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 14, i32 15, i32 14, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x6C83)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2) { ; merge-masked vpshuflw test, register source; expected asm shuffles into ymm1 under %k1, then copies to ymm0
+; CHECK-LABEL: test_masked_16xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3292, %ax # imm = 0xF324
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xF324)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec) { ; zero-masked vpshuflw test, register source
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3292, %ax # imm = 0xF324
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xF324)
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { ; unmasked vpshufhw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_16xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshufhw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12838, %ax # imm = 0xCDDA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xCDDA)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { ; zero-masked vpshufhw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12838, %ax # imm = 0xCDDA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 12, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xCDDA)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshuflw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $14962, %ax # imm = 0x3A72
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x3A72)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp) { ; zero-masked vpshuflw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $14962, %ax # imm = 0x3A72
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x3A72)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshufhw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $1029, %ax # imm = 0x405
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x405)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp) { ; zero-masked vpshufhw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $1029, %ax # imm = 0x405
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 6, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 13, i32 14, i32 13, i32 14> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x405)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { ; unmasked vpshuflw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_16xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshuflw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30862, %ax # imm = 0x8772
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0x8772)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { ; zero-masked vpshuflw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30862, %ax # imm = 0x8772
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0x8772)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshufhw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3845, %ax # imm = 0xF0FB
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xF0FB)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp) { ; zero-masked vpshufhw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3845, %ax # imm = 0xF0FB
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 14, i32 15> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xF0FB)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshuflw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-20955, %ax # imm = 0xAE25
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xAE25)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp) { ; zero-masked vpshuflw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-20955, %ax # imm = 0xAE25
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 9, i32 11, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15> ; permutes words 0-3 and 8-11 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer ; 1 = take %shuf, 0 = zero; lane i is bit i of the k-mask (0xAE25)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) { ; unmasked vpshufhw test: <16 x i16> source loaded from %vp
+; CHECK-LABEL: test_16xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  ret <16 x i16> %res
+}
+define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2) { ; merge-masked vpshufhw test: <16 x i16> source loaded from %vp, passthrough %vec2
+; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-24190, %ax # imm = 0xA182
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13> ; permutes words 4-7 and 12-15 with the same pattern; all other words stay in place
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2 ; 1 = take %shuf, 0 = keep %vec2; lane i is bit i of the k-mask (0xA182)
+  ret <16 x i16> %res
+}
+
+; Zero-masked high-word shuffle of a <16 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the mask constant through %ax/%eax into %k1 and uses a {%k1} {z} vpshufhw
+; that reads its source directly from memory.
+define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-24190, %ax # imm = 0xA182
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 5, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 12, i32 13>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  ret <16 x i16> %res
+}
+
+; Merge-masked low-word shuffle of a <16 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the mask constant through %ax/%eax into %k1 and uses a
+; {%k1} vpshuflw from memory that merges into ymm0.
+define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2) {
+; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-24392, %ax # imm = 0xA0B8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> %vec2
+  ret <16 x i16> %res
+}
+
+; Zero-masked low-word shuffle of a <16 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the mask constant through %ax/%eax into %k1 and uses a {%k1} {z} vpshuflw
+; that reads its source directly from memory.
+define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp) {
+; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-24392, %ax # imm = 0xA0B8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i16>, <16 x i16>* %vp
+  %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 10, i32 12, i32 13, i32 14, i32 15>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <16 x i16> %shuf, <16 x i16> zeroinitializer
+  ret <16 x i16> %res
+}
+
+; Unmasked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place). The expected output
+; is a single register-to-register vpshufhw on zmm0; no scheduling annotation is expected
+; on the shuffle itself.
+define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+  ret <32 x i16> %res
+}
+; Merge-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshufhw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1671867126, %eax # imm = 0x63A6AAF6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1671867126, %eax # imm = 0x63A6AAF6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 28>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Merge-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshuflw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-514766311, %eax # imm = 0xE1514A19
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshuflw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-514766311, %eax # imm = 0xE1514A19
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Merge-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshufhw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $165000787, %eax # imm = 0x9D5B653
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $165000787, %eax # imm = 0x9D5B653
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 6, i32 4, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 14, i32 12, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 22, i32 20, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 30, i32 28, i32 31>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Unmasked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place). The expected output
+; is a single register-to-register vpshuflw on zmm0; no scheduling annotation is expected
+; on the shuffle itself.
+define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i16> %res
+}
+; Merge-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshuflw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1998504075, %eax # imm = 0x771EC08B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshuflw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1998504075, %eax # imm = 0x771EC08B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 3, i32 1, i32 3, i32 4, i32 5, i32 6, i32 7, i32 11, i32 11, i32 9, i32 11, i32 12, i32 13, i32 14, i32 15, i32 19, i32 19, i32 17, i32 19, i32 20, i32 21, i32 22, i32 23, i32 27, i32 27, i32 25, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Merge-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshufhw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-730778639, %eax # imm = 0xD47133F1
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-730778639, %eax # imm = 0xD47133F1
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 7, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 15, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 23, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 31, i32 29, i32 30>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Merge-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshuflw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $544659762, %eax # imm = 0x2076D932
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshuflw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $544659762, %eax # imm = 0x2076D932
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 1, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 10, i32 9, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 18, i32 17, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 26, i32 25, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Unmasked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place). The expected output
+; is a single register-to-register vpshufhw on zmm0; no scheduling annotation is expected
+; on the shuffle itself.
+define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) {
+; CHECK-LABEL: test_32xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+  ret <32 x i16> %res
+}
+; Merge-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshufhw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1243446456, %eax # imm = 0xB5E28348
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+  %res = select <32 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1243446456, %eax # imm = 0xB5E28348
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 12, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 20, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 28, i32 28, i32 29, i32 30>
+  %res = select <32 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Merge-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1, performs a
+; {%k1} vpshuflw from zmm0 into zmm1, and then copies zmm1 to zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1409246810, %eax # imm = 0x53FF665A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked low-word shuffle of the <32 x i16> argument %vec. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshuflw on zmm0.
+define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1409246810, %eax # imm = 0x53FF665A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 0, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 16, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 24, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+; Unmasked high-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place). The expected output
+; is a single vpshufhw into zmm0 that reads its source directly from memory.
+define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+  ret <32 x i16> %res
+}
+; Merge-masked high-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1 and uses a
+; {%k1} vpshufhw from memory that merges into zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1911488810, %eax # imm = 0x8E10FED6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw that reads its source directly from memory.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1911488810, %eax # imm = 0x8E10FED6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 5, i32 6, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 13, i32 14, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 21, i32 22, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 29, i32 30>
+  %res = select <32 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+; Merge-masked low-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1 and uses a
+; {%k1} vpshuflw from memory that merges into zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1098876619, %eax # imm = 0xBE807935
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked low-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshuflw that reads its source directly from memory.
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1098876619, %eax # imm = 0xBE807935
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 1, i32 1, i32 3, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 11, i32 11, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 19, i32 19, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 27, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+; Merge-masked high-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1 and uses a
+; {%k1} vpshufhw from memory that merges into zmm0.
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1583892148, %eax # imm = 0xA197B94C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+  %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+; Zero-masked high-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 4-7 of each 8-element group (elements 0-3 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and zero otherwise. The expected
+; output moves the 32-bit mask constant through %eax into %k1 and uses a {%k1} {z}
+; vpshufhw that reads its source directly from memory.
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp) {
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1583892148, %eax # imm = 0xA197B94C
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 7, i32 6, i32 4, i32 8, i32 9, i32 10, i32 11, i32 12, i32 15, i32 14, i32 12, i32 16, i32 17, i32 18, i32 19, i32 20, i32 23, i32 22, i32 20, i32 24, i32 25, i32 26, i32 27, i32 28, i32 31, i32 30, i32 28>
+  %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+; Unmasked low-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place). The expected output
+; is a single vpshuflw into zmm0 that reads its source directly from memory.
+define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) {
+; CHECK-LABEL: test_32xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i16> %res
+}
+; Merge-masked low-word shuffle of a <32 x i16> loaded from %vp. The shuffle permutes
+; elements 0-3 of each 8-element group (elements 4-7 stay in place); the constant select
+; keeps the shuffled element where the mask bit is 1 and the %vec2 element otherwise.
+; The expected output moves the 32-bit mask constant through %eax into %k1 and uses a
+; {%k1} vpshuflw from memory that merges into zmm0.
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2) {
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-216128444, %eax # imm = 0xF31E2444
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { ; Zero-masked form: lanes with a 0 select bit become zero; the checks expect vpshuflw under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-216128444, %eax # imm = 0xF31E2444
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 2, i32 2, i32 0, i32 3, i32 4, i32 5, i32 6, i32 7, i32 10, i32 10, i32 8, i32 11, i32 12, i32 13, i32 14, i32 15, i32 18, i32 18, i32 16, i32 19, i32 20, i32 21, i32 22, i32 23, i32 26, i32 26, i32 24, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0xF31E2444
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked permute of the high four words of every 8-word group; unselected lanes keep %vec2; the checks expect vpshufhw under {%k1} with the load folded.
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1480468153, %eax # imm = 0x583E26B9
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x583E26B9
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp) { ; Zero-masked permute of the high four words of every 8-word group; unselected lanes become zero; the checks expect vpshufhw under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $1480468153, %eax # imm = 0x583E26B9
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 4, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 12, i32 14, i32 13, i32 16, i32 17, i32 18, i32 19, i32 23, i32 20, i32 22, i32 21, i32 24, i32 25, i32 26, i32 27, i32 31, i32 28, i32 30, i32 29>
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x583E26B9
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked low-word permute; here the checks record an unmasked vpshufd from memory into zmm1 plus a masked vmovdqu16 into zmm0, not a masked vpshuflw.
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT:    movl $-1778617447, %eax # imm = 0x95FC7399
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31> ; words move only as adjacent pairs (0,1 repeated), matching the dword pattern [0,0,2,3] in the checks
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x95FC7399
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp) { ; Zero-masked low-word permute; here the checks record an unmasked vpshufd from memory plus a zero-masked vmovdqu16 on zmm0, not a masked vpshuflw.
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15]
+; CHECK-NEXT:    movl $-1778617447, %eax # imm = 0x95FC7399
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 16, i32 17, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 24, i32 25, i32 28, i32 29, i32 30, i32 31> ; words move only as adjacent pairs (0,1 repeated), matching the dword pattern [0,0,2,3] in the checks
+  %res = select <32 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x95FC7399
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { ; Unmasked permute of the high four words of every 8-word group, source loaded from %vp; the checks expect one vpshufhw.
+; CHECK-LABEL: test_32xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp ; the checks show this load folded into the shuffle as a mem[...] operand
+  %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked form: lanes with a 0 select bit keep %vec2; the checks expect vpshufhw under {%k1} with the load folded.
+; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $355619267, %eax # imm = 0x153251C3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x153251C3
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { ; Zero-masked form: lanes with a 0 select bit become zero; the checks expect vpshufhw under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $355619267, %eax # imm = 0x153251C3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 6, i32 5, i32 6, i32 6, i32 8, i32 9, i32 10, i32 11, i32 14, i32 13, i32 14, i32 14, i32 16, i32 17, i32 18, i32 19, i32 22, i32 21, i32 22, i32 22, i32 24, i32 25, i32 26, i32 27, i32 30, i32 29, i32 30, i32 30>
+  %res = select <32 x i1> <i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x153251C3
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2) { ; Merge-masked form: lanes with a 0 select bit keep %vec2; the checks expect vpshuflw under {%k1} with the load folded.
+; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1890659259, %eax # imm = 0x8F4ED445
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> %vec2 ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x8F4ED445
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp) { ; Zero-masked form: lanes with a 0 select bit become zero; the checks expect vpshuflw under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl $-1890659259, %eax # imm = 0x8F4ED445
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <32 x i16>, <32 x i16>* %vp
+  %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> <i32 3, i32 1, i32 3, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 9, i32 11, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 17, i32 19, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 25, i32 27, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %res = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <32 x i16> %shuf, <32 x i16> zeroinitializer ; mask lane i is bit i of the immediate (lane 0 = LSB), which gives 0x8F4ED445
+  ret <32 x i16> %res
+}
+
+define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { ; Unmasked dword permute of a register operand; the checks expect vpermilps here, whereas the masked variants expect vpshufd.
+; CHECK-LABEL: test_4xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing xmm1 under {%k1}, then a vmovdqa copy into xmm0.
+; CHECK-LABEL: test_masked_4xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <0,1,1,0> is the immediate 6
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on xmm0.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 0>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <0,1,1,0> is the immediate 6
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing xmm1 under {%k1}, then a vmovdqa copy into xmm0.
+; CHECK-LABEL: test_masked_4xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <1,0,1,0> is the immediate 5
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on xmm0.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 2, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <1,0,1,0> is the immediate 5
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing xmm1 under {%k1}, then a vmovdqa copy into xmm0.
+; CHECK-LABEL: test_masked_4xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <0,1,1,1> is the immediate 14
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on xmm0.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 0>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <0,1,1,1> is the immediate 14
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { ; Unmasked dword permute of a register operand; the checks expect vpermilps here, whereas the masked variants expect vpshufd.
+; CHECK-LABEL: test_4xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing xmm1 under {%k1}, then a vmovdqa copy into xmm0.
+; CHECK-LABEL: test_masked_4xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <0,1,0,1> is the immediate 10
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on xmm0.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <0,1,0,1> is the immediate 10
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) { ; Unmasked dword permute with the source loaded from %vp; the checks expect a single vpermilps.
+; CHECK-LABEL: test_4xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp ; the checks show this load folded into the shuffle as a mem[...] operand
+  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing xmm0 with the load folded.
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <0,1,0,1> is the immediate 10
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 3>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <0,1,0,1> is the immediate 10
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing xmm0 with the load folded.
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <1,0,1,0> is the immediate 5
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 3, i32 1>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <1,0,1,0> is the immediate 5
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing xmm0 with the load folded.
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <1,1,0,1> is the immediate 11
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 0, i32 1>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <1,1,0,1> is the immediate 11
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { ; Unmasked dword permute with the source loaded from %vp; the checks expect a single vpermilps.
+; CHECK-LABEL: test_4xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp ; the checks show this load folded into the shuffle as a mem[...] operand
+  %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  ret <4 x i32> %res
+}
+define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing xmm0 with the load folded.
+; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> %vec2 ; lane 0 is the mask LSB, so <1,0,0,0> is the immediate 1
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <4 x i32>, <4 x i32>* %vp
+  %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 1, i32 0>
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x i32> %shuf, <4 x i32> zeroinitializer ; lane 0 is the mask LSB, so <1,0,0,0> is the immediate 1
+  ret <4 x i32> %res
+}
+
+define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) { ; Unmasked dword permute repeating one 4-element pattern in each half of a ymm register; the checks expect vpermilps.
+; CHECK-LABEL: test2_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing ymm1 under {%k1}, then a vmovdqa copy into ymm0.
+; CHECK-LABEL: test2_masked_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-99, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0x9D, printed as the signed byte -99
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on ymm0.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-99, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 1, i32 0, i32 6, i32 7, i32 5, i32 4>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0x9D, printed as the signed byte -99
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing ymm1 under {%k1}, then a vmovdqa copy into ymm0.
+; CHECK-LABEL: test2_masked_8xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-90, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0xA6, printed as the signed byte -90
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on ymm0.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-90, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 3, i32 3, i32 4, i32 7, i32 7, i32 7>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0xA6, printed as the signed byte -90
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing ymm1 under {%k1}, then a vmovdqa copy into ymm0.
+; CHECK-LABEL: test2_masked_8xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; only lane 2 is selected, so the immediate is 4
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on ymm0.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 0, i32 3, i32 5, i32 6, i32 4, i32 7>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; only lane 2 is selected, so the immediate is 4
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) { ; Unmasked dword permute repeating one 4-element pattern in each half of a ymm register; the checks expect vpermilps.
+; CHECK-LABEL: test2_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2) { ; Merge-masked dword permute: unselected lanes keep %vec2; the checks show vpshufd writing ymm1 under {%k1}, then a vmovdqa copy into ymm0.
+; CHECK-LABEL: test2_masked_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 # sched: [1:0.25]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0x74 = 116
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec) { ; Zero-masked dword permute: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} operating in place on ymm0.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 1, i32 0, i32 5, i32 7, i32 5, i32 4>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0x74 = 116
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; Unmasked dword permute with the source loaded from %vp; the checks expect a single vpermilps.
+; CHECK-LABEL: test2_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp ; the checks show this load folded into the shuffle as a mem[...] operand
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing ymm0 with the load folded.
+; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0xE7, printed as the signed byte -25
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 0, i32 5, i32 4, i32 6, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0xE7, printed as the signed byte -25
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing ymm0 with the load folded.
+; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-97, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0x9F, printed as the signed byte -97
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-97, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 0, i32 4, i32 7, i32 6, i32 4>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0x9F, printed as the signed byte -97
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2) { ; Merge-masked dword permute of a loaded vector: unselected lanes keep %vec2; the checks expect vpshufd under {%k1} writing ymm0 with the load folded.
+; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $73, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane 0 is the mask LSB: 0x49 = 73
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp) { ; Zero-masked dword permute of a loaded vector: unselected lanes become zero; the checks expect vpshufd under {%k1} {z} with the load folded.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $73, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 3, i32 1, i32 7, i32 6, i32 7, i32 5>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane 0 is the mask LSB: 0x49 = 73
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; Unmasked dword permute of the vector loaded from %vp; the expected assembly uses vpermilps with a memory operand.
+; CHECK-LABEL: test2_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4> ; same 4-element pattern in each 128-bit half (indices 4-7 are the pattern plus 4)
+  ret <8 x i32> %res
+}
+define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2) { ; Merge-masked dword permute of the vector loaded from %vp into %vec2.
+; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec2 ; lane i = mask bit i: 0x0A (10); cleared lanes keep %vec2
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp) { ; Zero-masked dword permute of the vector loaded from %vp.
+; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <8 x i32>, <8 x i32>* %vp
+  %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> <i32 3, i32 2, i32 0, i32 0, i32 7, i32 6, i32 4, i32 4>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer ; lane i = mask bit i: 0x0A (10); cleared lanes become zero
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) { ; Unmasked 512-bit dword permute of %vec; the expected assembly uses vpermilps.
+; CHECK-LABEL: test2_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12> ; pattern 3,1,3,0 repeated in every 128-bit lane
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2) { ; Merge-masked dword permute of %vec into %vec2; the expected assembly merges into zmm1 and then copies it to zmm0.
+; CHECK-LABEL: test2_masked_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $18453, %ax # imm = 0x4815
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x4815; cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec) { ; Zero-masked dword permute of %vec.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $18453, %ax # imm = 0x4815
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 3, i32 0, i32 7, i32 5, i32 7, i32 4, i32 11, i32 9, i32 11, i32 8, i32 15, i32 13, i32 15, i32 12>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x4815; cleared lanes become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2) { ; Merge-masked dword permute of %vec into %vec2; the expected assembly merges into zmm1 and then copies it to zmm0.
+; CHECK-LABEL: test2_masked_16xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $11142, %ax # imm = 0x2B86
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x2B86; cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec) { ; Zero-masked dword permute of %vec.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $11142, %ax # imm = 0x2B86
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 3, i32 0, i32 6, i32 4, i32 7, i32 4, i32 10, i32 8, i32 11, i32 8, i32 14, i32 12, i32 15, i32 12>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x2B86; cleared lanes become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2) { ; Merge-masked dword permute of %vec into %vec2; the expected assembly merges into zmm1 and then copies it to zmm0.
+; CHECK-LABEL: test2_masked_16xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $15610, %ax # imm = 0x3CFA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x3CFA; cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec) { ; Zero-masked dword permute of %vec.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $15610, %ax # imm = 0x3CFA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 3, i32 3, i32 0, i32 5, i32 7, i32 7, i32 4, i32 9, i32 11, i32 11, i32 8, i32 13, i32 15, i32 15, i32 12>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x3CFA; cleared lanes become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) { ; Unmasked 512-bit dword permute of %vec; the expected assembly uses vpermilps.
+; CHECK-LABEL: test2_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15> ; pattern 3,2,0,3 repeated in every 128-bit lane
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2) { ; Merge-masked dword permute of %vec into %vec2; the expected assembly merges into zmm1 and then copies it to zmm0.
+; CHECK-LABEL: test2_masked_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $14814, %ax # imm = 0x39DE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x39DE; cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec) { ; Zero-masked dword permute of %vec.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $14814, %ax # imm = 0x39DE
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 0, i32 3, i32 7, i32 6, i32 4, i32 7, i32 11, i32 10, i32 8, i32 11, i32 15, i32 14, i32 12, i32 15>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x39DE; cleared lanes become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; Unmasked dword permute of the vector loaded from %vp; the expected assembly uses vpermilps with a memory operand.
+; CHECK-LABEL: test2_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15> ; pattern 1,0,1,3 repeated in every 128-bit lane
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2) { ; Merge-masked dword permute of the vector loaded from %vp into %vec2.
+; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $7334, %ax # imm = 0x1CA6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x1CA6; cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; Zero-masked dword permute of the vector loaded from %vp.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $7334, %ax # imm = 0x1CA6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 1, i32 3, i32 5, i32 4, i32 5, i32 7, i32 9, i32 8, i32 9, i32 11, i32 13, i32 12, i32 13, i32 15>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x1CA6; cleared lanes become zero
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2) { ; Merge-masked dword permute of the vector loaded from %vp into %vec2.
+; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-25463, %ax # imm = 0x9C89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0x9C89 (-25463 as a signed 16-bit value); cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp) { ; Zero-masked dword permute of the vector loaded from %vp.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-25463, %ax # imm = 0x9C89
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 2, i32 5, i32 4, i32 4, i32 6, i32 9, i32 8, i32 8, i32 10, i32 13, i32 12, i32 12, i32 14>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0x9C89 (-25463 as a signed 16-bit value); cleared lanes become zero
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2) { ; Merge-masked dword permute of the vector loaded from %vp into %vec2.
+; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-14529, %ax # imm = 0xC73F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0xC73F (-14529 as a signed 16-bit value); cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp) { ; Zero-masked dword permute of the vector loaded from %vp.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-14529, %ax # imm = 0xC73F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 2, i32 0, i32 1, i32 2, i32 6, i32 4, i32 5, i32 6, i32 10, i32 8, i32 9, i32 10, i32 14, i32 12, i32 13, i32 14>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0xC73F (-14529 as a signed 16-bit value); cleared lanes become zero
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; Unmasked dword permute of the vector loaded from %vp; the expected assembly uses vpermilps with a memory operand.
+; CHECK-LABEL: test2_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13> ; pattern 3,1,1,1 repeated in every 128-bit lane
+  ret <16 x i32> %res
+}
+define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2) { ; Merge-masked dword permute of the vector loaded from %vp into %vec2.
+; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-21392, %ax # imm = 0xAC70
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec2 ; lane i = mask bit i: 0xAC70 (-21392 as a signed 16-bit value); cleared lanes keep %vec2
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; Zero-masked dword permute of the vector loaded from %vp.
+; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-21392, %ax # imm = 0xAC70
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec = load <16 x i32>, <16 x i32>* %vp
+  %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> <i32 3, i32 1, i32 1, i32 1, i32 7, i32 5, i32 5, i32 5, i32 11, i32 9, i32 9, i32 9, i32 15, i32 13, i32 13, i32 13>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer ; lane i = mask bit i: 0xAC70 (-21392 as a signed 16-bit value); cleared lanes become zero
+  ret <16 x i32> %res
+}
+
+define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; Unmasked 128-bit-chunk shuffle of two 256-bit float vectors; the expected assembly is a single vperm2f128.
+; CHECK-LABEL: test2_8xfloat_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; indices 4-7: upper half of %vec1; indices 8-11: lower half of %vec2
+  ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test2_8xfloat_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xD7 (-41 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { ; Zero-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-41, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xD7 (-41 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test2_8xfloat_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-63, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xC1 (-63 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2) { ; Zero-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-63, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xC1 (-63 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle taking the upper half of both inputs; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test2_8xfloat_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $107, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0x6B (107); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2) { ; Zero-masked 128-bit-chunk shuffle taking the upper half of both inputs; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $107, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0x6B (107); cleared lanes become zero
+  ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; Unmasked 128-bit-chunk shuffle of two 256-bit float vectors; the expected assembly is a single vperm2f128.
+; CHECK-LABEL: test2_8xfloat_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; indices 4-7: upper half of %vec1; indices 8-11: lower half of %vec2
+  ret <8 x float> %res
+}
+define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test2_8xfloat_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $66, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0x42 (66); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { ; Zero-masked 128-bit-chunk shuffle; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $66, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0x42 (66); cleared lanes become zero
+  ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; Unmasked 128-bit-chunk shuffle whose second input is loaded from %vec2p; the expected assembly folds the load into vperm2f128.
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> ; indices 4-7: upper half of %vec1; indices 12-15: upper half of %vec2
+  ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xE8 (-24 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { ; Zero-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xE8 (-24 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xFA (-6 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) { ; Zero-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xFA (-6 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-50, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xCE (-50 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) { ; Zero-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-50, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xCE (-50 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; Unmasked 128-bit-chunk shuffle whose second input is loaded from %vec2p; the expected assembly folds the load into vperm2f128.
+; CHECK-LABEL: test_8xfloat_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> ; indices 4-7: upper half of %vec1; indices 8-11: lower half of %vec2
+  ret <8 x float> %res
+}
+define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a masked vblendmps.
+; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-26, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3 ; lane i = mask bit i: 0xE6 (-26 as a signed byte); cleared lanes keep %vec3
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { ; Zero-masked 128-bit-chunk shuffle with a memory second input; the expected assembly is an unmasked vperm2f128 followed by a zero-masked vmovaps.
+; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-26, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovaps %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer ; lane i = mask bit i: 0xE6 (-26 as a signed byte); cleared lanes become zero
+  ret <8 x float> %res
+}
+
+define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) { ; Unmasked 128-bit-chunk shuffle of two 512-bit float vectors; the expected assembly uses vshuff64x2.
+; CHECK-LABEL: test_16xfloat_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 128-bit chunks 3 and 0 of %vec1, then chunks 1 and 3 of %vec2 (indices 16 and up select from %vec2)
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) { ; Merge-masked 128-bit-chunk shuffle into %vec3; the expected assembly is a masked vshuff32x4 into zmm2 followed by a copy to zmm0.
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3 ; lane i = mask bit i: 0xD328 (-11480 as a signed 16-bit value); cleared lanes keep %vec3
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-11480, %ax # imm = 0xD328
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31> ; 128-bit lanes: vec1[3], vec1[0], vec2[1], vec2[3]
+  %out = select <16 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 3,5,8,9,12,14,15 (0xD328) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31> ; 128-bit lanes: vec1[0], vec1[2], vec2[0], vec2[3]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 0,1,3,8,9,11,13,15 (0xAB0B) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-21749, %ax # imm = 0xAB0B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31> ; 128-bit lanes: vec1[0], vec1[2], vec2[0], vec2[3]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 0,1,3,8,9,11,13,15 (0xAB0B) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $75, %ax # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[3], vec1[1], vec2[0], vec2[1]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 0,1,3,6 (mask value 75) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $75, %ax # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[3], vec1[1], vec2[0], vec2[1]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 0,1,3,6 (mask value 75) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; unmasked; 128-bit lanes: vec1[2], vec1[3], vec2[0], vec2[2]
+  ret <16 x float> %perm
+}
+define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; 128-bit lanes: vec1[2], vec1[3], vec2[0], vec2[2]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 0,1,3,4,6,9-14 (0x7E5B) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $32347, %ax # imm = 0x7E5B
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27> ; 128-bit lanes: vec1[2], vec1[3], vec2[0], vec2[2]
+  %out = select <16 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 0,1,3,4,6,9-14 (0x7E5B) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; unmasked; 128-bit lanes: vec1[3], vec1[2], mem[2], mem[1]
+  ret <16 x float> %perm
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[3], vec1[2], mem[2], mem[1]
+  %out = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 5,6,7,10,12,13,15 (0xB4E0) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-19232, %ax # imm = 0xB4E0
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[3], vec1[2], mem[2], mem[1]
+  %out = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 5,6,7,10,12,13,15 (0xB4E0) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[2], vec1[1], mem[2], mem[1]
+  %out = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 2,5,10,11,15 (0x8C24) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-29660, %ax # imm = 0x8C24
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23> ; 128-bit lanes: vec1[2], vec1[1], mem[2], mem[1]
+  %out = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 2,5,10,11,15 (0x8C24) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27> ; 128-bit lanes: vec1[0], vec1[0], mem[2], mem[2]
+  %out = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 7,12,14,15 (0xD080) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 24, i32 25, i32 26, i32 27> ; 128-bit lanes: vec1[0], vec1[0], mem[2], mem[2]
+  %out = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 7,12,14,15 (0xD080) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; unmasked; 128-bit lanes: vec1[1], vec1[0], mem[3], mem[3]
+  ret <16 x float> %perm
+}
+define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; 128-bit lanes: vec1[1], vec1[0], mem[3], mem[3]
+  %out = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x float> %perm, <16 x float> %vec3 ; merge-mask: elements 0-3,6,9,11,15 (0x8A4F) take the shuffle, the rest keep %vec3
+  ret <16 x float> %out
+}
+
+define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <16 x float>, <16 x float>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <16 x float> %vec1, <16 x float> %rhs, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31> ; 128-bit lanes: vec1[1], vec1[0], mem[3], mem[3]
+  %out = select <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <16 x float> %perm, <16 x float> zeroinitializer ; zero-mask: elements 0-3,6,9,11,15 (0x8A4F) take the shuffle, the rest become 0.0
+  ret <16 x float> %out
+}
+
+define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; unmasked; high half of vec1, then low half of vec2
+  ret <4 x double> %perm
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of vec2
+  %out = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 0,2,3 (mask value 13) take the shuffle, element 1 keeps %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of vec2
+  %out = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 0,2,3 (mask value 13) take the shuffle, element 1 becomes 0.0
+  ret <4 x double> %out
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of vec2
+  %out = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 0,1,3 (mask value 11) take the shuffle, element 2 keeps %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of vec2
+  %out = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 0,1,3 (mask value 11) take the shuffle, element 2 becomes 0.0
+  ret <4 x double> %out
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of vec2
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 1,2,3 (mask value 14) take the shuffle, element 0 keeps %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of vec2
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 1,2,3 (mask value 14) take the shuffle, element 0 becomes 0.0
+  ret <4 x double> %out
+}
+define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; unmasked; high half of vec1, then high half of vec2
+  ret <4 x double> %perm
+}
+define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of vec2
+  %out = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 2,3 (mask value 12) take the shuffle, elements 0,1 keep %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of vec2
+  %out = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 2,3 (mask value 12) take the shuffle, elements 0,1 become 0.0
+  ret <4 x double> %out
+}
+define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; unmasked; high half of vec1, then high half of the loaded vector
+  ret <4 x double> %perm
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 1,2,3 (mask value 14) take the shuffle, element 0 keeps %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 1,2,3 (mask value 14) take the shuffle, element 0 becomes 0.0
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: only element 3 (mask value 8) takes the shuffle, the rest keep %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: only element 3 (mask value 8) takes the shuffle, the rest become 0.0
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 1,2 (mask value 6) take the shuffle, elements 0,3 keep %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 4, i32 5> ; high half of vec1, then low half of the loaded vector
+  %out = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 1,2 (mask value 6) take the shuffle, elements 0,3 become 0.0
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; unmasked; high half of vec1, then high half of the loaded vector
+  ret <4 x double> %perm
+}
+define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of the loaded vector
+  %out = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> %vec3 ; merge-mask: elements 0,2,3 (mask value 13) take the shuffle, element 1 keeps %vec3
+  ret <4 x double> %out
+}
+
+define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %rhs = load <4 x double>, <4 x double>* %vec2p ; second operand is loaded; the expected asm folds it as mem[...]
+  %perm = shufflevector <4 x double> %vec1, <4 x double> %rhs, <4 x i32> <i32 2, i32 3, i32 6, i32 7> ; high half of vec1, then high half of the loaded vector
+  %out = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x double> %perm, <4 x double> zeroinitializer ; zero-mask: elements 0,2,3 (mask value 13) take the shuffle, element 1 becomes 0.0
+  ret <4 x double> %out
+}
+
+define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; unmasked; 128-bit lanes: vec1[3], vec1[1], vec2[3], vec2[0]
+  ret <8 x double> %perm
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; 128-bit lanes: vec1[3], vec1[1], vec2[3], vec2[0]
+  %out = select <8 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x double> %perm, <8 x double> %vec3 ; merge-mask: elements 1-5 (mask value 62) take the shuffle, the rest keep %vec3
+  ret <8 x double> %out
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 2, i32 3, i32 14, i32 15, i32 8, i32 9> ; 128-bit lanes: vec1[3], vec1[1], vec2[3], vec2[0]
+  %out = select <8 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false>, <8 x double> %perm, <8 x double> zeroinitializer ; zero-mask: elements 1-5 (mask value 62) take the shuffle, the rest become 0.0
+  ret <8 x double> %out
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-70, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13> ; 128-bit lanes: vec1[0], vec1[2], vec2[0], vec2[2]
+  %out = select <8 x i1> <i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true>, <8 x double> %perm, <8 x double> %vec3 ; merge-mask: elements 1,3,4,5,7 (0xBA, i.e. -70 as a signed byte) take the shuffle, the rest keep %vec3
+  ret <8 x double> %out
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-70, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 12, i32 13> ; 128-bit lanes: vec1[0], vec1[2], vec2[0], vec2[2]
+  %out = select <8 x i1> <i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 true>, <8 x double> %perm, <8 x double> zeroinitializer ; zero-mask: elements 1,3,4,5,7 (0xBA, i.e. -70 as a signed byte) take the shuffle, the rest become 0.0
+  ret <8 x double> %out
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $30, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9> ; 128-bit lanes: vec1[3], vec1[2], vec2[2], vec2[0]
+  %out = select <8 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x double> %perm, <8 x double> %vec3 ; merge-mask: elements 1-4 (mask value 30) take the shuffle, the rest keep %vec3
+  ret <8 x double> %out
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $30, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %perm = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 12, i32 13, i32 8, i32 9> ; 128-bit lanes: vec1[3], vec1[2], vec2[2], vec2[0]
+  %out = select <8 x i1> <i1 false, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false>, <8 x double> %perm, <8 x double> zeroinitializer ; zero-mask: elements 1-4 (mask value 30) take the shuffle, the rest become 0.0
+  ret <8 x double> %out
+}
+define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {  ; Unmasked two-source shuffle of double pairs. Expected code: one vshuff64x2 and nothing else before the return.
+; CHECK-LABEL: test_8xdouble_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[4,5,4,5] then vec2[4,5,2,3]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {  ; Merge-masked two-source shuffle of double pairs. Expected code: vshuff64x2 under %k1 into zmm2, then vmovapd zmm2 -> zmm0.
+; CHECK-LABEL: test_8xdouble_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $56, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[4,5,4,5] then vec2[4,5,2,3]
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3  ; mask bit i = element i: 0x38, loaded as $56; false elements keep %vec3
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) {  ; Zero-masked two-source shuffle of double pairs. Expected code: one vshuff64x2 with {%k1} {z}.
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $56, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[4,5,4,5] then vec2[4,5,2,3]
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer  ; mask bit i = element i: 0x38, loaded as $56; false elements become zero
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Unmasked shuffle whose second source is loaded from memory. Expected code: one vshuff64x2 taking a "mem" operand.
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; no separate load instruction is expected in the output
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[6,7,0,1] then vec2[0,1,0,1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {  ; Merge-masked shuffle with a memory second source. Expected code: vshuff64x2 under %k1 into zmm1, then vmovapd zmm1 -> zmm0.
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $95, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[6,7,0,1] then vec2[0,1,0,1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3  ; mask bit i = element i: 0x5F, loaded as $95; false elements keep %vec3
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Zero-masked shuffle with a memory second source. Expected code: one vshuff64x2 with {%k1} {z} and a "mem" operand.
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $95, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[6,7,0,1] then vec2[0,1,0,1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer  ; mask bit i = element i: 0x5F, loaded as $95; false elements become zero
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {  ; Merge-masked shuffle with a memory second source. Expected code: vshuff64x2 under %k1 into zmm1, then vmovapd zmm1 -> zmm0.
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[6,7,6,7] then vec2[0,1,2,3]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3  ; mask bit i = element i: 0xFA, loaded as $-6; false elements keep %vec3
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Zero-masked shuffle with a memory second source. Expected code: one vshuff64x2 with {%k1} {z} and a "mem" operand.
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 6, i32 7, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[6,7,6,7] then vec2[0,1,2,3]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer  ; mask bit i = element i: 0xFA, loaded as $-6; false elements become zero
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {  ; Merge-masked shuffle with a memory second source. Expected code: vshuff64x2 under %k1 into zmm1, then vmovapd zmm1 -> zmm0.
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[0,1,2,3] then vec2[0,1,4,5]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3  ; mask bit i = element i: 0x0A, loaded as $10; false elements keep %vec3
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Zero-masked shuffle with a memory second source. Expected code: one vshuff64x2 with {%k1} {z} and a "mem" operand.
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 12, i32 13>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[0,1,2,3] then vec2[0,1,4,5]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer  ; mask bit i = element i: 0x0A, loaded as $10; false elements become zero
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Unmasked shuffle whose second source is loaded from memory. Expected code: one vshuff64x2 taking a "mem" operand.
+; CHECK-LABEL: test_8xdouble_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; no separate load instruction is expected in the output
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[2,3,0,1] then vec2[4,5,0,1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {  ; Merge-masked shuffle with a memory second source. Expected code: vshuff64x2 under %k1 into zmm1, then vmovapd zmm1 -> zmm0.
+; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[2,3,0,1] then vec2[4,5,0,1]
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3  ; mask bit i = element i: 0x06, loaded as $6; false elements keep %vec3
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {  ; Zero-masked shuffle with a memory second source. Expected code: one vshuff64x2 with {%k1} {z} and a "mem" operand.
+; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 12, i32 13, i32 8, i32 9>  ; indices 0-7 select from %vec1, 8-15 from %vec2: vec1[2,3,0,1] then vec2[4,5,0,1]
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer  ; mask bit i = element i: 0x06, loaded as $6; false elements become zero
+  ret <8 x double> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Unmasked shuffle of 128-bit halves of two <8 x i32> vectors. Expected code: one vperm2f128 (the f128 form, although the elements are i32).
+; CHECK-LABEL: test_8xi32_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vpblendmd under %k1 with ymm2.
+; CHECK-LABEL: test_8xi32_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $26, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x1A, loaded as $26; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Zero-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $26, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x1A, loaded as $26; false elements become zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vpblendmd under %k1 with ymm2.
+; CHECK-LABEL: test_8xi32_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0xFC, loaded as $-4; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Zero-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0xFC, loaded as $-4; false elements become zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vpblendmd under %k1 with ymm2.
+; CHECK-LABEL: test_8xi32_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $51, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x33, loaded as $51; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Zero-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $51, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x33, loaded as $51; false elements become zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Unmasked shuffle of 128-bit halves of two <8 x i32> vectors. Expected code: one vperm2f128 (the f128 form, although the elements are i32).
+; CHECK-LABEL: test_8xi32_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vpblendmd under %k1 with ymm2.
+; CHECK-LABEL: test_8xi32_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $92, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x5C, loaded as $92; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) {  ; Zero-masked shuffle of 128-bit halves. Expected code: an unmasked vperm2i128 first, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $92, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x5C, loaded as $92; false elements become zero
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Unmasked shuffle of 128-bit halves with a memory second source. Expected code: one vperm2f128 taking a "mem" operand.
+; CHECK-LABEL: test_8xi32_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; no separate load instruction is expected in the output
+  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vpblendmd under %k1 with ymm1.
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x40, loaded as $64 (only element 6 selected); false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Zero-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then upper half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x40, loaded as $64 (only element 6 selected); false elements become zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vpblendmd under %k1 with ymm1.
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x98, loaded as $-104; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Zero-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $-104, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x98, loaded as $-104; false elements become zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vpblendmd under %k1 with ymm1.
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $113, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x71, loaded as $113; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Zero-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $113, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x71, loaded as $113; false elements become zero
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Unmasked shuffle of 128-bit halves with a memory second source. Expected code: one vperm2f128 taking a "mem" operand.
+; CHECK-LABEL: test_8xi32_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; no separate load instruction is expected in the output
+  %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  ret <8 x i32> %res
+}
+define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3) {  ; Merge-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vpblendmd under %k1 with ymm1.
+; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $45, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> %vec3  ; mask bit i = element i: 0x2D, loaded as $45; false elements keep %vec3
+  ret <8 x i32> %res
+}
+
+define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) {  ; Zero-masked shuffle of 128-bit halves with a memory second source. Expected code: unmasked vperm2i128 with a "mem" operand, then vmovdqa32 with {%k1} {z}.
+; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $45, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i32>, <8 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>  ; indices 0-7 select from %vec1, 8-15 from %vec2: upper half of vec1, then lower half of vec2
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i32> %shuf, <8 x i32> zeroinitializer  ; mask bit i = element i: 0x2D, loaded as $45; false elements become zero
+  ret <8 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Unmasked shuffle of 4 x i32 groups from two <16 x i32> vectors. Expected code: one vshufi64x2, whose printed indices count 64-bit elements.
+; CHECK-LABEL: test_16xi32_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7] twice, then vec2[4-7], vec2[12-15]
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {  ; Merge-masked shuffle of 4 x i32 groups. Expected code: vshufi32x4 under %k1 into zmm2, then vmovdqa64 zmm2 -> zmm0.
+; CHECK-LABEL: test_16xi32_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7] twice, then vec2[4-7], vec2[12-15]
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3  ; mask bit i = element i: 0x0BB3, loaded as $2995; false elements keep %vec3
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Zero-masked shuffle of 4 x i32 groups. Expected code: one vshufi32x4 with {%k1} {z}.
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $2995, %ax # imm = 0xBB3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7] twice, then vec2[4-7], vec2[12-15]
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; mask bit i = element i: 0x0BB3, loaded as $2995; false elements become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {  ; Merge-masked shuffle of 4 x i32 groups. Expected code: vshufi32x4 under %k1 into zmm2, then vmovdqa64 zmm2 -> zmm0.
+; CHECK-LABEL: test_16xi32_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[8-11] twice, then vec2[8-11], vec2[4-7]
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3  ; mask bit i = element i: 0x47E8, loaded as $18408; false elements keep %vec3
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Zero-masked shuffle of 4 x i32 groups. Expected code: one vshufi32x4 with {%k1} {z}.
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $18408, %ax # imm = 0x47E8
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[8-11] twice, then vec2[8-11], vec2[4-7]
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; mask bit i = element i: 0x47E8, loaded as $18408; false elements become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {  ; Merge-masked shuffle of 4 x i32 groups. Expected code: vshufi32x4 under %k1 into zmm2, then vmovdqa64 zmm2 -> zmm0.
+; CHECK-LABEL: test_16xi32_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7], vec1[8-11], then vec2[0-3] twice
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3  ; mask bit i = element i: 0x3D79, loaded as $15737; false elements keep %vec3
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Zero-masked shuffle of 4 x i32 groups. Expected code: one vshufi32x4 with {%k1} {z}.
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $15737, %ax # imm = 0x3D79
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7], vec1[8-11], then vec2[0-3] twice
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; mask bit i = element i: 0x3D79, loaded as $15737; false elements become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Unmasked shuffle of 4 x i32 groups from two <16 x i32> vectors. Expected code: one vshufi64x2, whose printed indices count 64-bit elements.
+; CHECK-LABEL: test_16xi32_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7], vec1[0-3], then vec2[8-11], vec2[4-7]
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3) {  ; Merge-masked shuffle of 4 x i32 groups. Expected code: vshufi32x4 under %k1 into zmm2, then vmovdqa64 zmm2 -> zmm0.
+; CHECK-LABEL: test_16xi32_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7], vec1[0-3], then vec2[8-11], vec2[4-7]
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3  ; mask bit i = element i: 0xF3FF, loaded as $-3073 (all but elements 10 and 11); false elements keep %vec3
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) {  ; Zero-masked shuffle of 4 x i32 groups. Expected code: one vshufi32x4 with {%k1} {z}.
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-3073, %ax # imm = 0xF3FF
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 20, i32 21, i32 22, i32 23>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[4-7], vec1[0-3], then vec2[8-11], vec2[4-7]
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; mask bit i = element i: 0xF3FF, loaded as $-3073 (all but elements 10 and 11); false elements become zero
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {  ; Unmasked shuffle of 4 x i32 groups with a memory second source. Expected code: one vshufi64x2 taking a "mem" operand; printed indices count 64-bit elements.
+; CHECK-LABEL: test_16xi32_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p  ; second shuffle source; no separate load instruction is expected in the output
+  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[8-11], vec1[4-7], then vec2[8-11], vec2[0-3]
+  ret <16 x i32> %res
+}
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {  ; Merge-masked shuffle of 4 x i32 groups with a memory second source. Expected code: vshufi32x4 under %k1 into zmm1, then vmovdqa64 zmm1 -> zmm0.
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[8-11], vec1[4-7], then vec2[8-11], vec2[0-3]
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3  ; mask bit i = element i: 0xE01A, loaded as $-8166; false elements keep %vec3
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) {  ; Zero-masked shuffle of 4 x i32 groups with a memory second source. Expected code: one vshufi32x4 with {%k1} {z} and a "mem" operand.
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-8166, %ax # imm = 0xE01A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p  ; second shuffle source; expected to appear as the "mem" operand of the shuffle
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 16, i32 17, i32 18, i32 19>  ; indices 0-15 select from %vec1, 16-31 from %vec2: vec1[8-11], vec1[4-7], then vec2[8-11], vec2[0-3]
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer  ; mask bit i = element i: 0xE01A, loaded as $-8166; false elements become zero
+  ret <16 x i32> %res
+}
+
+; Merge-masking variant of a 128-bit-group shuffle (%vec1 groups 1,1; loaded %vec2 groups 0,2):
+; lanes 1,4,5,6,8,12,15 (mask constant 0x9172) take the shuffle result, the rest keep %vec3.
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> %vec3
+  ret <16 x i32> %res
+}
+
+; Zero-masking variant of a 128-bit-group shuffle (%vec1 groups 1,1; loaded %vec2 groups 0,2):
+; lanes 1,4,5,6,8,12,15 (mask constant 0x9172) take the shuffle result, the rest become zero.
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-28302, %ax # imm = 0x9172
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+; Merge-masking variant of a 128-bit-group shuffle (%vec1 groups 1,2; loaded %vec2 groups 3,3):
+; lanes 1,2,4,9,11,13,14 (mask constant 0x6A16) take the shuffle result, the rest keep %vec3.
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  ret <16 x i32> %res
+}
+
+; Zero-masking variant of a 128-bit-group shuffle (%vec1 groups 1,2; loaded %vec2 groups 3,3):
+; lanes 1,2,4,9,11,13,14 (mask constant 0x6A16) take the shuffle result, the rest become zero.
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $27158, %ax # imm = 0x6A16
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 28, i32 29, i32 30, i32 31, i32 28, i32 29, i32 30, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+; Unmasked shuffle of whole 128-bit groups (4 x i32 each): group 1 of %vec1 twice, then
+; groups 1 and 3 of the vector loaded from %vec2p. The expected output folds the load into a
+; single vshufi64x2.
+define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+  ret <16 x i32> %res
+}
+; Merge-masking variant of a 128-bit-group shuffle (%vec1 groups 1,1; loaded %vec2 groups 1,3):
+; lanes 0,1,3,4,5,6,7,9,10,13,14 (mask constant 0x66FB) take the shuffle result, the rest
+; keep %vec3.
+define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3) {
+; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> %vec3
+  ret <16 x i32> %res
+}
+
+; Zero-masking variant of a 128-bit-group shuffle (%vec1 groups 1,1; loaded %vec2 groups 1,3):
+; lanes 0,1,3,4,5,6,7,9,10,13,14 (mask constant 0x66FB) take the shuffle result, the rest
+; become zero.
+define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) {
+; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $26363, %ax # imm = 0x66FB
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x i32>, <16 x i32>* %vec2p
+  %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 28, i32 29, i32 30, i32 31>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x i32> %shuf, <16 x i32> zeroinitializer
+  ret <16 x i32> %res
+}
+
+; Unmasked 128-bit-half shuffle: elements 2,3 of %vec1 followed by elements 0,1 of %vec2.
+; The expected output is a single vperm2f128.
+define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 2,3 of %vec1 and 0,1 of %vec2; lanes 0,2,3
+; (mask constant 13) take the shuffle result, lane 1 keeps %vec3. The expected output is an
+; unmasked vperm2i128 followed by a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 2,3 of %vec1 and 0,1 of %vec2; lanes 0,2,3
+; (mask constant 13) take the shuffle result, lane 1 becomes zero. The expected output is an
+; unmasked vperm2i128 followed by a zero-masked vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 2,3 of %vec1 and 2,3 of %vec2; lanes 0,1,3
+; (mask constant 11) take the shuffle result, lane 2 keeps %vec3. The expected output is an
+; unmasked vperm2i128 followed by a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 2,3 of %vec1 and 2,3 of %vec2; lanes 0,1,3
+; (mask constant 11) take the shuffle result, lane 2 becomes zero. The expected output is an
+; unmasked vperm2i128 followed by a zero-masked vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 2,3 of %vec1 and 0,1 of %vec2; lanes 0,1
+; (mask constant 3) take the shuffle result, lanes 2,3 keep %vec3. The expected output is an
+; unmasked vperm2i128 followed by a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 2,3 of %vec1 and 0,1 of %vec2; lanes 0,1
+; (mask constant 3) take the shuffle result, lanes 2,3 become zero. The expected output is an
+; unmasked vperm2i128 followed by a zero-masked vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Unmasked 128-bit-half shuffle: elements 2,3 of %vec1 followed by elements 2,3 of %vec2.
+; The expected output is a single vperm2f128.
+define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 2,3 of %vec1 and 2,3 of %vec2; lanes 1,2,3
+; (mask constant 14) take the shuffle result, lane 0 keeps %vec3. The expected output is an
+; unmasked vperm2i128 followed by a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 2,3 of %vec1 and 2,3 of %vec2; lanes 1,2,3
+; (mask constant 14) take the shuffle result, lane 0 becomes zero. The expected output is an
+; unmasked vperm2i128 followed by a zero-masked vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+; Unmasked 128-bit-half shuffle with a loaded second operand: elements 2,3 of %vec1 followed
+; by elements 2,3 of the vector loaded from %vec2p. The expected output folds the load into a
+; single vperm2f128.
+define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i64> %res
+}
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 2,3 of the loaded vector; lane 1 (mask constant 2) takes the shuffle result, the other
+; lanes keep %vec3. The expected output is vperm2i128 (load folded) plus a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 2,3 of the loaded vector; lane 1 (mask constant 2) takes the shuffle result, the other
+; lanes become zero. The expected output is vperm2i128 (load folded) plus a zero-masked
+; vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 0,1 of the loaded vector; lanes 1,2,3 (mask constant 14) take the shuffle result, lane 0
+; keeps %vec3. The expected output is vperm2i128 (load folded) plus a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 0,1 of the loaded vector; lanes 1,2,3 (mask constant 14) take the shuffle result, lane 0
+; becomes zero. The expected output is vperm2i128 (load folded) plus a zero-masked vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 0,1 of the loaded vector; lane 3 (mask constant 8) takes the shuffle result, the other
+; lanes keep %vec3. The expected output is vperm2i128 (load folded) plus a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 0,1 of the loaded vector; lane 3 (mask constant 8) takes the shuffle result, the other
+; lanes become zero. The expected output is vperm2i128 (load folded) plus a zero-masked
+; vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [3:1.00]
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Unmasked 128-bit-half shuffle with a loaded second operand: elements 2,3 of %vec1 followed
+; by elements 2,3 of the vector loaded from %vec2p. The expected output folds the load into a
+; single vperm2f128.
+define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  ret <4 x i64> %res
+}
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 2,3 of the loaded vector; lanes 1,3 (mask constant 10) take the shuffle result, lanes
+; 0,2 keep %vec3. The expected output is vperm2i128 (load folded) plus a masked vpblendmq.
+define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3) {
+; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> %vec3
+  ret <4 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3 of %vec1
+; and 2,3 of the loaded vector; lanes 1,3 (mask constant 10) take the shuffle result, lanes
+; 0,2 become zero. The expected output is vperm2i128 (load folded) plus a zero-masked
+; vmovdqa64.
+define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) {
+; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [3:1.00]
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x i64>, <4 x i64>* %vec2p
+  %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x i64> %shuf, <4 x i64> zeroinitializer
+  ret <4 x i64> %res
+}
+
+; Unmasked shuffle of 128-bit pairs (2 x i64 each): elements 4,5 of %vec1 twice, then
+; elements 4,5 of %vec2 twice. The expected output is a single vshufi64x2.
+define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+  ret <8 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 4,5,4,5 of %vec1 and 4,5,4,5 of %vec2;
+; lanes 0,4,5,6,7 (mask byte 0xF1, emitted as $-15) take the shuffle result, the rest keep
+; %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-15, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 4,5,4,5 of %vec1 and 4,5,4,5 of %vec2;
+; lanes 0,4,5,6,7 (mask byte 0xF1, emitted as $-15) take the shuffle result, the rest become
+; zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-15, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 4, i32 5, i32 12, i32 13, i32 12, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 6,7,4,5 of %vec1 and 2,3,4,5 of %vec2;
+; every lane except lane 4 (mask byte 0xEF, emitted as $-17) takes the shuffle result, lane 4
+; keeps %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-17, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 6,7,4,5 of %vec1 and 2,3,4,5 of %vec2;
+; every lane except lane 4 (mask byte 0xEF, emitted as $-17) takes the shuffle result, lane 4
+; becomes zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-17, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 12, i32 13>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 0,1,4,5 of %vec1 and 0,1,0,1 of %vec2;
+; lanes 3,5,6,7 (mask byte 0xE8, emitted as $-24) take the shuffle result, the rest keep
+; %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 0,1,4,5 of %vec1 and 0,1,0,1 of %vec2;
+; lanes 3,5,6,7 (mask byte 0xE8, emitted as $-24) take the shuffle result, the rest become
+; zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-24, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Unmasked shuffle of 128-bit pairs (2 x i64 each): elements 2,3 and 6,7 of %vec1, then
+; elements 4,5 and 2,3 of %vec2. The expected output is a single vshufi64x2.
+define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+  ret <8 x i64> %res
+}
+; Merge-masking variant: the shuffle takes elements 2,3,6,7 of %vec1 and 4,5,2,3 of %vec2;
+; lanes 0,1,3 (mask constant 11) take the shuffle result, the rest keep %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant: the shuffle takes elements 2,3,6,7 of %vec1 and 4,5,2,3 of %vec2;
+; lanes 0,1,3 (mask constant 11) take the shuffle result, the rest become zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 6, i32 7, i32 12, i32 13, i32 10, i32 11>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+; Unmasked shuffle of 128-bit pairs with a loaded second operand: elements 2,3 of %vec1
+; twice, then elements 4,5 and 2,3 of the vector loaded from %vec2p. The expected output
+; folds the load into a single vshufi64x2.
+define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+  ret <8 x i64> %res
+}
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3,2,3 of
+; %vec1 and 4,5,2,3 of the loaded vector; lanes 1,2,3,4,7 (mask byte 0x9E, emitted as $-98)
+; take the shuffle result, the rest keep %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3,2,3 of
+; %vec1 and 4,5,2,3 of the loaded vector; lanes 1,2,3,4,7 (mask byte 0x9E, emitted as $-98)
+; take the shuffle result, the rest become zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 2, i32 3, i32 12, i32 13, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3,0,1 of
+; %vec1 and 0,1,0,1 of the loaded vector; lanes 0,1,3 (mask constant 11) take the shuffle
+; result, the rest keep %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3,0,1 of
+; %vec1 and 0,1,0,1 of the loaded vector; lanes 0,1,3 (mask constant 11) take the shuffle
+; result, the rest become zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 8, i32 9, i32 8, i32 9>
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 4,5,0,1 of
+; %vec1 and 2,3,2,3 of the loaded vector; lanes 1,3,5 (mask constant 42) take the shuffle
+; result, the rest keep %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 4,5,0,1 of
+; %vec1 and 2,3,2,3 of the loaded vector; lanes 1,3,5 (mask constant 42) take the shuffle
+; result, the rest become zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $42, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 4, i32 5, i32 0, i32 1, i32 10, i32 11, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Unmasked shuffle of 128-bit pairs with a loaded second operand: elements 2,3 and 0,1 of
+; %vec1, then elements 6,7 and 2,3 of the vector loaded from %vec2p. The expected output
+; folds the load into a single vshufi64x2.
+define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+  ret <8 x i64> %res
+}
+; Merge-masking variant with a loaded second operand: the shuffle takes elements 2,3,0,1 of
+; %vec1 and 6,7,2,3 of the loaded vector; lanes 1,3,4,5,6,7 (mask byte 0xFA, emitted as $-6)
+; take the shuffle result, lanes 0,2 keep %vec3.
+define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3) {
+; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> %vec3
+  ret <8 x i64> %res
+}
+
+; Zero-masking variant with a loaded second operand: the shuffle takes elements 2,3,0,1 of
+; %vec1 and 6,7,2,3 of the loaded vector; lanes 1,3,4,5,6,7 (mask byte 0xFA, emitted as $-6)
+; take the shuffle result, lanes 0,2 become zero.
+define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) {
+; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x i64>, <8 x i64>* %vec2p
+  %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 14, i32 15, i32 10, i32 11>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i64> %shuf, <8 x i64> zeroinitializer
+  ret <8 x i64> %res
+}
+
+; Unmasked interleave of the two low elements of %vec1 and %vec2 (result = v1[0], v2[0],
+; v1[1], v2[1]). The expected output is a single vunpcklps.
+define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,1,1> (lane 0 = bit 0) matches the expected mask immediate 12 (0b1100).
+define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,1,1> (lane 0 = bit 0) matches the expected mask immediate 12 (0b1100); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,0,1> (lane 0 = bit 0) matches the expected mask immediate 10 (0b1010).
+define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,0,1> (lane 0 = bit 0) matches the expected mask immediate 10 (0b1010); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 6 (0b0110).
+define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 6 (0b0110); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Unmasked baseline: interleaves elements 0 and 1 of %vec1 with elements 0 and 1 of %vec2 (indices <0,4,1,5>).
+; The expected assembly is a single register-register vunpcklps on xmm.
+define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <1,1,0,0> (lane 0 = bit 0) matches the expected mask immediate 3 (0b0011).
+define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <1,1,0,0> (lane 0 = bit 0) matches the expected mask immediate 3 (0b0011); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Unmasked baseline with a memory source: %vec2 is loaded from %vec2p and interleaved with %vec1 (indices <0,4,1,5>).
+; The expected assembly folds the load into vunpcklps as its memory operand.
+define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %res
+}
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,0,1> (lane 0 = bit 0) matches the expected mask immediate 8 (0b1000).
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,0,1> (lane 0 = bit 0) matches the expected mask immediate 8 (0b1000); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $8, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 6 (0b0110).
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 6 (0b0110); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,0,1> (lane 0 = bit 0) matches the expected mask immediate 10 (0b1010).
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,0,1> (lane 0 = bit 0) matches the expected mask immediate 10 (0b1010); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Unmasked baseline with a memory source: %vec2 is loaded from %vec2p and interleaved with %vec1 (indices <0,4,1,5>).
+; The expected assembly folds the load into vunpcklps as its memory operand.
+define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  ret <4 x float> %res
+}
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 4 (0b0100).
+define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 4 (0b0100); the {z} form is expected.
+define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Unmasked baseline: interleaves elements 0,1 and 4,5 of %vec1 with the same elements of %vec2 (indices <0,8,1,9,4,12,5,13>).
+; The expected assembly is a single register-register vunpcklps on ymm.
+define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,0,1,1,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 122 (0x7A).
+define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $122, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,0,1,1,1,1,0> (lane 0 = bit 0) matches the expected mask immediate 122 (0x7A); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $122, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <1,0,1,0,1,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -107 (0x95).
+define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-107, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <1,0,1,0,1,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -107 (0x95); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-107, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <1,1,1,0,0,1,1,1> (lane 0 = bit 0) matches the expected mask immediate -25 (0xE7).
+define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <1,1,1,0,0,1,1,1> (lane 0 = bit 0) matches the expected mask immediate -25 (0xE7); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-25, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Unmasked baseline: interleaves elements 0,1 and 4,5 of %vec1 with the same elements of %vec2 (indices <0,8,1,9,4,12,5,13>).
+; The expected assembly is a single register-register vunpcklps on ymm.
+define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <1,0,0,0,0,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -127 (0x81).
+define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-127, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <1,0,0,0,0,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -127 (0x81); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-127, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Unmasked baseline with a memory source: %vec2 is loaded from %vec2p and interleaved with %vec1 (indices <0,8,1,9,4,12,5,13>).
+; The expected assembly folds the load into vunpcklps as its memory operand.
+define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %res
+}
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,0,1,0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 72 (0x48).
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $72, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,0,1,0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 72 (0x48); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $72, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,0,0,0,0,1,1> (lane 0 = bit 0) matches the expected mask immediate -64 (0xC0).
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,0,0,0,0,1,1> (lane 0 = bit 0) matches the expected mask immediate -64 (0xC0); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,1,1,1,1,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -98 (0x9E).
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,1,1,1,1,0,0,1> (lane 0 = bit 0) matches the expected mask immediate -98 (0x9E); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-98, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Unmasked baseline with a memory source: %vec2 is loaded from %vec2p and interleaved with %vec1 (indices <0,8,1,9,4,12,5,13>).
+; The expected assembly folds the load into vunpcklps as its memory operand.
+define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  ret <8 x float> %res
+}
+; Masked variant with a memory source: lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The select constant <0,0,0,0,0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 64 (0x40).
+define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked variant with a memory source: lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The select constant <0,0,0,0,0,0,1,0> (lane 0 = bit 0) matches the expected mask immediate 64 (0x40); the {z} form is expected.
+define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $64, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Unmasked baseline: interleaves elements 0,1 / 4,5 / 8,9 / 12,13 of %vec1 with the same elements of %vec2.
+; The expected assembly is a single register-register vunpcklps on zmm; no sched annotation is expected on it here.
+define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The 16-lane select constant (lane 0 = bit 0) matches the expected mask immediate 0xE8E4, loaded with movw.
+define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The 16-lane select constant (lane 0 = bit 0) matches the expected mask immediate 0xE8E4; the {z} form is expected.
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-5916, %ax # imm = 0xE8E4
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The 16-lane select constant (lane 0 = bit 0) matches the expected mask immediate 0xFB96, loaded with movw.
+define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked variant: result lanes come from the unpack where the select constant is 1 and are zero where it is 0.
+; The 16-lane select constant (lane 0 = bit 0) matches the expected mask immediate 0xFB96; the {z} form is expected.
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-1130, %ax # imm = 0xFB96
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Masked variant: result lanes come from the unpack where the select constant is 1 and from %vec3 where it is 0.
+; The 16-lane select constant (lane 0 = bit 0) matches the expected mask immediate 0xCF69, loaded with movw.
+define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2) { ; Zero-masked low unpack of two register operands: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12439, %ax # imm = 0xCF69
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0xCF69
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) { ; Unmasked low unpack of two register operands (no select); expects a single zmm vunpcklps.
+; CHECK-LABEL: test_16xfloat_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) { ; Merge-masked low unpack of two register operands: lanes with a 0 select bit keep %vec3. Expects vunpcklps under {%k1}, then vmovaps into zmm0.
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0xE6F3
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) { ; Zero-masked low unpack of two register operands: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-6413, %ax # imm = 0xE6F3
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0xE6F3
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; Unmasked low unpack with the second operand loaded from %vec2p; expects a single zmm vunpcklps with a mem operand.
+; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklps under {%k1}, then vmovaps into zmm0.
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0x4F66
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $20326, %ax # imm = 0x4F66
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0x4F66
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklps under {%k1}, then vmovaps into zmm0.
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0xBAD5
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-17707, %ax # imm = 0xBAD5
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0xBAD5
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklps under {%k1}, then vmovaps into zmm0.
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0xE619
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-6631, %ax # imm = 0xE619
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0xE619
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; Unmasked low unpack with the second operand loaded from %vec2p; expects a single zmm vunpcklps with a mem operand.
+; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  ret <16 x float> %res
+}
+define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklps under {%k1}, then vmovaps into zmm0.
+; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0xAF19
+  ret <16 x float> %res
+}
+
+define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklps with {%k1} {z}.
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-20711, %ax # imm = 0xAF19
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0xAF19
+  ret <16 x float> %res
+}
+
+define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) { ; Unmasked low unpack of two <2 x double> registers; the expected instruction here is vmovlhps rather than vunpcklpd.
+; CHECK-LABEL: test_2xdouble_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2> ; element 0 of each input
+  ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into xmm0.
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b01 (movb $1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b01 (movb $1)
+  ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into xmm0.
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b10 (movb $2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b10 (movb $2)
+  ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { ; Unmasked low unpack with the second operand loaded from %vec2p; expects a single xmm vunpcklpd with a mem operand.
+; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  ret <2 x double> %res
+}
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into xmm0.
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b01 (movb $1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b01 (movb $1)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into xmm0.
+; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b10 (movb $2)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 0, i32 2>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b10 (movb $2)
+  ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; Unmasked low unpack of two register operands (no select); expects a single ymm vunpcklpd.
+; CHECK-LABEL: test_4xdouble_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ; elements 0 and 2 of each input, interleaved
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b1101 (movb $13)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b1101 (movb $13)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b1110 (movb $14)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b1110 (movb $14)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b0110 (movb $6)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b0110 (movb $6)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; Unmasked low unpack of two register operands (no select); expects a single ymm vunpcklpd.
+; CHECK-LABEL: test_4xdouble_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ; elements 0 and 2 of each input, interleaved
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) { ; Merge-masked low unpack of two registers: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b1010 (movb $10)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) { ; Zero-masked low unpack of two registers: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b1010 (movb $10)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; Unmasked low unpack with the second operand loaded from %vec2p; expects a single ymm vunpcklpd with a mem operand.
+; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b0100 (movb $4)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b0100 (movb $4)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b1011 (movb $11)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b1011 (movb $11)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b0111 (movb $7)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b0111 (movb $7)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; Unmasked low unpack with the second operand loaded from %vec2p; expects a single ymm vunpcklpd with a mem operand.
+; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) { ; Merge-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit keep %vec3. Expects vunpcklpd under {%k1}, then vmovapd into ymm0.
+; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lane i of the condition is k-mask bit i, i.e. 0b0001 (movb $1)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { ; Zero-masked low unpack with the second operand loaded from %vec2p: lanes with a 0 select bit come from zeroinitializer. Expects vunpcklpd with {%k1} {z}.
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; expected to be folded into the unpack as its mem operand
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane i of the condition is k-mask bit i, i.e. 0b0001 (movb $1)
+  ret <4 x double> %res
+}
+
+; Unmasked shuffle interleaving elements 0, 2, 4 and 6 of %vec1 with the same elements of %vec2.
+; Expected codegen: a single vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against %vec3 with mask <1,1,1,0,1,1,0,1>.
+; Expected codegen: immediate -73 moved into %k1, merge-masked vunpcklpd into zmm2, vmovapd zmm2 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-73, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against zeroinitializer with mask <1,1,1,0,1,1,0,1>.
+; Expected codegen: immediate -73 moved into %k1 and a zero-masked ({z}) vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-73, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against %vec3 with mask <0,1,1,0,0,1,1,0>.
+; Expected codegen: immediate 102 moved into %k1, merge-masked vunpcklpd into zmm2, vmovapd zmm2 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $102, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,1,1,0,0,1,1,0>.
+; Expected codegen: immediate 102 moved into %k1 and a zero-masked ({z}) vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $102, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against %vec3 with mask <0,1,0,0,1,0,1,1>.
+; Expected codegen: immediate -46 moved into %k1, merge-masked vunpcklpd into zmm2, vmovapd zmm2 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-46, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,1,0,0,1,0,1,1>.
+; Expected codegen: immediate -46 moved into %k1 and a zero-masked ({z}) vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-46, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Unmasked shuffle interleaving elements 0, 2, 4 and 6 of %vec1 with the same elements of %vec2.
+; Expected codegen: a single vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against %vec3 with mask <0,1,0,1,0,1,0,1>.
+; Expected codegen: immediate -86 moved into %k1, merge-masked vunpcklpd into zmm2, vmovapd zmm2 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,1,0,1,0,1,0,1>.
+; Expected codegen: immediate -86 moved into %k1 and a zero-masked ({z}) vunpcklpd on zmm registers.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-86, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+; Unmasked shuffle interleaving elements 0, 2, 4 and 6 of %vec1 with the same elements of the vector loaded from %vec2p.
+; Expected codegen: a single vunpcklpd on zmm0 with the load folded into a memory operand.
+define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <1,0,0,0,0,0,0,0>.
+; Expected codegen: immediate 1 moved into %k1, merge-masked vunpcklpd (memory operand) into zmm1, vmovapd zmm1 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <1,0,0,0,0,0,0,0>.
+; Expected codegen: immediate 1 moved into %k1 and a zero-masked ({z}) vunpcklpd with a memory operand.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <0,1,1,1,1,1,1,0>.
+; Expected codegen: immediate 126 moved into %k1, merge-masked vunpcklpd (memory operand) into zmm1, vmovapd zmm1 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $126, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <0,1,1,1,1,1,1,0>.
+; Expected codegen: immediate 126 moved into %k1 and a zero-masked ({z}) vunpcklpd with a memory operand.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $126, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <1,0,1,1,1,0,1,1>.
+; Expected codegen: immediate -35 moved into %k1, merge-masked vunpcklpd (memory operand) into zmm1, vmovapd zmm1 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <1,0,1,1,1,0,1,1>.
+; Expected codegen: immediate -35 moved into %k1 and a zero-masked ({z}) vunpcklpd with a memory operand.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-35, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Unmasked shuffle interleaving elements 0, 2, 4 and 6 of %vec1 with the same elements of the vector loaded from %vec2p.
+; Expected codegen: a single vunpcklpd on zmm0 with the load folded into a memory operand.
+define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x double> %res
+}
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <0,1,1,1,1,1,0,0>.
+; Expected codegen: immediate 62 moved into %k1, merge-masked vunpcklpd (memory operand) into zmm1, vmovapd zmm1 -> zmm0.
+define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> %vec3
+  ret <8 x double> %res
+}
+
+; Interleaves elements 0, 2, 4 and 6 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <0,1,1,1,1,1,0,0>.
+; Expected codegen: immediate 62 moved into %k1 and a zero-masked ({z}) vunpcklpd with a memory operand.
+define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $62, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  %res = select <8 x i1> <i1 0, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer
+  ret <8 x double> %res
+}
+
+; Unmasked shuffle interleaving elements 2 and 3 of %vec1 with elements 2 and 3 of %vec2.
+; Expected codegen: a single vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against %vec3 with mask <1,0,1,0>.
+; Expected codegen: immediate 5 moved into %k1, merge-masked vunpckhps into xmm2, vmovaps xmm2 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against zeroinitializer with mask <1,0,1,0>.
+; Expected codegen: immediate 5 moved into %k1 and a zero-masked ({z}) vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against %vec3 with mask <0,0,1,1>.
+; Expected codegen: immediate 12 moved into %k1, merge-masked vunpckhps into xmm2, vmovaps xmm2 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,0,1,1>.
+; Expected codegen: immediate 12 moved into %k1 and a zero-masked ({z}) vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against %vec3 with mask <1,1,0,0>.
+; Expected codegen: immediate 3 moved into %k1, merge-masked vunpckhps into xmm2, vmovaps xmm2 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against zeroinitializer with mask <1,1,0,0>.
+; Expected codegen: immediate 3 moved into %k1 and a zero-masked ({z}) vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $3, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Unmasked shuffle interleaving elements 2 and 3 of %vec1 with elements 2 and 3 of %vec2.
+; Expected codegen: a single vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against %vec3 with mask <1,1,1,0>.
+; Expected codegen: immediate 7 moved into %k1, merge-masked vunpckhps into xmm2, vmovaps xmm2 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and %vec2, then selects against zeroinitializer with mask <1,1,1,0>.
+; Expected codegen: immediate 7 moved into %k1 and a zero-masked ({z}) vunpckhps on xmm registers.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $7, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+; Unmasked shuffle interleaving elements 2 and 3 of %vec1 with elements 2 and 3 of the vector loaded from %vec2p.
+; Expected codegen: a single vunpckhps on xmm0 with the load folded into a memory operand.
+define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <0,0,1,0>.
+; Expected codegen: immediate 4 moved into %k1, merge-masked vunpckhps (memory operand) into xmm1, vmovaps xmm1 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <0,0,1,0>.
+; Expected codegen: immediate 4 moved into %k1 and a zero-masked ({z}) vunpckhps with a memory operand.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $4, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <1,0,1,1>.
+; Expected codegen: immediate 13 moved into %k1, merge-masked vunpckhps (memory operand) into xmm1, vmovaps xmm1 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <1,0,1,1>.
+; Expected codegen: immediate 13 moved into %k1 and a zero-masked ({z}) vunpckhps with a memory operand.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <0,1,0,1>.
+; Expected codegen: immediate 10 moved into %k1, merge-masked vunpckhps (memory operand) into xmm1, vmovaps xmm1 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <0,1,0,1>.
+; Expected codegen: immediate 10 moved into %k1 and a zero-masked ({z}) vunpckhps with a memory operand.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Unmasked shuffle interleaving elements 2 and 3 of %vec1 with elements 2 and 3 of the vector loaded from %vec2p.
+; Expected codegen: a single vunpckhps on xmm0 with the load folded into a memory operand.
+define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  ret <4 x float> %res
+}
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against %vec3 with mask <1,0,1,0>.
+; Expected codegen: immediate 5 moved into %k1, merge-masked vunpckhps (memory operand) into xmm1, vmovaps xmm1 -> xmm0.
+define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3) {
+; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> %vec3
+  ret <4 x float> %res
+}
+
+; Interleaves elements 2 and 3 of %vec1 and the vector loaded from %vec2p, then selects against zeroinitializer with mask <1,0,1,0>.
+; Expected codegen: immediate 5 moved into %k1 and a zero-masked ({z}) vunpckhps with a memory operand.
+define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) {
+; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $5, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x float>, <4 x float>* %vec2p
+  %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 0>, <4 x float> %shuf, <4 x float> zeroinitializer
+  ret <4 x float> %res
+}
+
+; Unmasked shuffle interleaving elements 2, 3, 6 and 7 of %vec1 with the same elements of %vec2.
+; Expected codegen: a single vunpckhps on ymm registers.
+define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %res
+}
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against %vec3 with mask <1,0,1,0,1,0,0,0>.
+; Expected codegen: immediate 21 moved into %k1, merge-masked vunpckhps into ymm2, vmovaps ymm2 -> ymm0.
+define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $21, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against zeroinitializer with mask <1,0,1,0,1,0,0,0>.
+; Expected codegen: immediate 21 moved into %k1 and a zero-masked ({z}) vunpckhps on ymm registers.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $21, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against %vec3 with mask <0,1,0,0,1,0,1,0>.
+; Expected codegen: immediate 82 moved into %k1, merge-masked vunpckhps into ymm2, vmovaps ymm2 -> ymm0.
+define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,1,0,0,1,0,1,0>.
+; Expected codegen: immediate 82 moved into %k1 and a zero-masked ({z}) vunpckhps on ymm registers.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $82, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against %vec3 with mask <0,1,0,0,0,0,0,1>.
+; Expected codegen: immediate -126 moved into %k1, merge-masked vunpckhps into ymm2, vmovaps ymm2 -> ymm0.
+define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-126, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Interleaves elements 2, 3, 6 and 7 of %vec1 and %vec2, then selects against zeroinitializer with mask <0,1,0,0,0,0,0,1>.
+; Expected codegen: immediate -126 moved into %k1 and a zero-masked ({z}) vunpckhps on ymm registers.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-126, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Unmasked high unpack of two <8 x float> registers: the shuffle interleaves
+; elements 2, 3, 6 and 7 of %vec1 and %vec2. Despite the "mask3" suffix there
+; is no select here; the expected output is a single vunpckhps.
+define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %res
+}
+; Merge-masked high unpack of two <8 x float> registers. Lanes with a set bit
+; in the constant select mask (0xED, lane 0 = bit 0) take the interleaved
+; result; the other lanes keep %vec3. The expected output loads the mask into
+; %k1 and copies the merged %ymm2 into %ymm0.
+define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-19, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked high unpack of two <8 x float> registers. Lanes with a set bit
+; in the constant select mask (0xED, lane 0 = bit 0) take the interleaved
+; result; the other lanes are zeroed, so the expected vunpckhps uses
+; {%k1} {z}.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-19, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+; Unmasked high unpack of <8 x float> where %vec2 is loaded from %vec2p. The
+; expected vunpckhps reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %res
+}
+; Merge-masked high unpack of <8 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x1C, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $28, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked high unpack of <8 x float> with %vec2 loaded from %vec2p. Lanes
+; with a set bit in the constant select mask (0x1C, lane 0 = bit 0) take the
+; interleaved result; the other lanes are zeroed. The expected vunpckhps
+; reads the second operand directly from memory and uses {%k1} {z}.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $28, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Merge-masked high unpack of <8 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x8D, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked high unpack of <8 x float> with %vec2 loaded from %vec2p. Lanes
+; with a set bit in the constant select mask (0x8D, lane 0 = bit 0) take the
+; interleaved result; the other lanes are zeroed. The expected vunpckhps
+; reads the second operand directly from memory and uses {%k1} {z}.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-115, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Merge-masked high unpack of <8 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0xB4, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked high unpack of <8 x float> with %vec2 loaded from %vec2p. Lanes
+; with a set bit in the constant select mask (0xB4, lane 0 = bit 0) take the
+; interleaved result; the other lanes are zeroed. The expected vunpckhps
+; reads the second operand directly from memory and uses {%k1} {z}.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Unmasked high unpack of <8 x float> where %vec2 is loaded from %vec2p.
+; Despite the "mask3" suffix there is no select here; the expected vunpckhps
+; reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  ret <8 x float> %res
+}
+; Merge-masked high unpack of <8 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x8C, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3) {
+; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %vec3
+  ret <8 x float> %res
+}
+
+; Zero-masked high unpack of <8 x float> with %vec2 loaded from %vec2p. Lanes
+; with a set bit in the constant select mask (0x8C, lane 0 = bit 0) take the
+; interleaved result; the other lanes are zeroed. The expected vunpckhps
+; reads the second operand directly from memory and uses {%k1} {z}.
+define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) {
+; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-116, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x float>, <8 x float>* %vec2p
+  %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x float> %shuf, <8 x float> zeroinitializer
+  ret <8 x float> %res
+}
+
+; Unmasked high unpack of two <16 x float> registers: the shuffle interleaves
+; elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2. The expected
+; output is a single zmm vunpckhps, which carries no "sched:" annotation.
+define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of two <16 x float> registers. Lanes with a set
+; bit in the constant select mask (0xD080, lane 0 = bit 0) take the
+; interleaved result; the other lanes keep %vec3. The expected output loads
+; the 16-bit mask with movw/kmovd and copies the merged %zmm2 into %zmm0.
+define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of two <16 x float> registers. Lanes with a set bit
+; in the constant select mask (0xD080, lane 0 = bit 0) take the interleaved
+; result; the other lanes are zeroed, so the expected vunpckhps uses
+; {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-12160, %ax # imm = 0xD080
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of two <16 x float> registers. Lanes with a set
+; bit in the constant select mask (0x8A4F, lane 0 = bit 0) take the
+; interleaved result; the other lanes keep %vec3. The expected output loads
+; the 16-bit mask with movw/kmovd and copies the merged %zmm2 into %zmm0.
+define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of two <16 x float> registers. Lanes with a set bit
+; in the constant select mask (0x8A4F, lane 0 = bit 0) take the interleaved
+; result; the other lanes are zeroed, so the expected vunpckhps uses
+; {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-30129, %ax # imm = 0x8A4F
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of two <16 x float> registers. Lanes with a set
+; bit in the constant select mask (0xF6BD, lane 0 = bit 0) take the
+; interleaved result; the other lanes keep %vec3. The expected output loads
+; the 16-bit mask with movw/kmovd and copies the merged %zmm2 into %zmm0.
+define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of two <16 x float> registers. Lanes with a set bit
+; in the constant select mask (0xF6BD, lane 0 = bit 0) take the interleaved
+; result; the other lanes are zeroed, so the expected vunpckhps uses
+; {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-2371, %ax # imm = 0xF6BD
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Unmasked high unpack of two <16 x float> registers: the shuffle interleaves
+; elements 2, 3, 6, 7, 10, 11, 14 and 15 of %vec1 and %vec2. Despite the
+; "mask3" suffix there is no select here; the expected output is a single
+; zmm vunpckhps.
+define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of two <16 x float> registers. Lanes with a set
+; bit in the constant select mask (0x9A6A, lane 0 = bit 0) take the
+; interleaved result; the other lanes keep %vec3. The expected output loads
+; the 16-bit mask with movw/kmovd and copies the merged %zmm2 into %zmm0.
+define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    vmovaps %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of two <16 x float> registers. Lanes with a set bit
+; in the constant select mask (0x9A6A, lane 0 = bit 0) take the interleaved
+; result; the other lanes are zeroed, so the expected vunpckhps uses
+; {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-26006, %ax # imm = 0x9A6A
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+; Unmasked high unpack of <16 x float> where %vec2 is loaded from %vec2p. The
+; expected zmm vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x966D, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x966D, lane 0 = bit 0)
+; take the interleaved result; the other lanes are zeroed. The expected
+; vunpckhps reads the second operand from memory and uses {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-27027, %ax # imm = 0x966D
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Merge-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x71EA, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x71EA, lane 0 = bit 0)
+; take the interleaved result; the other lanes are zeroed. The expected
+; vunpckhps reads the second operand from memory and uses {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $29162, %ax # imm = 0x71EA
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Merge-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x98A6, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x98A6, lane 0 = bit 0)
+; take the interleaved result; the other lanes are zeroed. The expected
+; vunpckhps reads the second operand from memory and uses {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $-26458, %ax # imm = 0x98A6
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 0, i1 1, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0, i1 1>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Unmasked high unpack of <16 x float> where %vec2 is loaded from %vec2p.
+; Despite the "mask3" suffix there is no select here; the expected zmm
+; vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x float> %res
+}
+; Merge-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x6289, lane 0 = bit 0)
+; take the interleaved result; the other lanes keep %vec3. The expected
+; vunpckhps reads the second operand directly from memory.
+define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3) {
+; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> %vec3
+  ret <16 x float> %res
+}
+
+; Zero-masked high unpack of <16 x float> with %vec2 loaded from %vec2p.
+; Lanes with a set bit in the constant select mask (0x6289, lane 0 = bit 0)
+; take the interleaved result; the other lanes are zeroed. The expected
+; vunpckhps reads the second operand from memory and uses {%k1} {z}.
+define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) {
+; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $25225, %ax # imm = 0x6289
+; CHECK-NEXT:    # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <16 x float>, <16 x float>* %vec2p
+  %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  %res = select <16 x i1> <i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 1, i1 0>, <16 x float> %shuf, <16 x float> zeroinitializer
+  ret <16 x float> %res
+}
+
+; Unmasked high unpack of two <2 x double> registers: the shuffle picks
+; element 1 of %vec1 followed by element 1 of %vec2. The expected output is a
+; single vunpckhpd.
+define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %res
+}
+; Merge-masked high unpack of two <2 x double> registers. The constant select
+; mask <0, 1> (immediate 2, lane 0 = bit 0) keeps lane 0 from %vec3 and takes
+; lane 1 from the shuffle. The expected output loads the mask into %k1 and
+; copies the merged %xmm2 into %xmm0.
+define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3
+  ret <2 x double> %res
+}
+
+; Zero-masked high unpack of two <2 x double> registers. The constant select
+; mask <0, 1> (immediate 2, lane 0 = bit 0) zeroes lane 0 and takes lane 1
+; from the shuffle, so the expected vunpckhpd uses {%k1} {z}.
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer
+  ret <2 x double> %res
+}
+; Merge-masked high unpack of two <2 x double> registers. The constant select
+; mask <1, 0> (immediate 1, lane 0 = bit 0) takes lane 0 from the shuffle and
+; keeps lane 1 from %vec3. The expected output loads the mask into %k1 and
+; copies the merged %xmm2 into %xmm0.
+define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
+  ret <2 x double> %res
+}
+
+; Zero-masked high unpack of two <2 x double> registers. The constant select
+; mask <1, 0> (immediate 1, lane 0 = bit 0) takes lane 0 from the shuffle and
+; zeroes lane 1, so the expected vunpckhpd uses {%k1} {z}.
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer
+  ret <2 x double> %res
+}
+; Unmasked high unpack of <2 x double> where %vec2 is loaded from %vec2p. The
+; expected vunpckhpd reads the second operand directly from memory.
+define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p
+  %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  ret <2 x double> %res
+}
+; Merge-masked high unpack of <2 x double> with %vec2 loaded from %vec2p. The
+; constant select mask <1, 0> (immediate 1, lane 0 = bit 0) takes lane 0 from
+; the shuffle and keeps lane 1 from %vec3. The expected vunpckhpd reads the
+; second operand directly from memory.
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3>
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> %vec3
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3> ; result lanes: %vec1[1], %vec2[1]
+  %res = select <2 x i1> <i1 1, i1 0>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane 0 takes %shuf, lane 1 is zero; mask bits 0b01 = 1 (expected movb immediate)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3) {
+; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3> ; result lanes: %vec1[1], %vec2[1]
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> %vec3 ; lane 1 takes %shuf, lane 0 keeps %vec3; mask bits 0b10 = 2 (expected movb immediate)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p) {
+; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $2, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <2 x double>, <2 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> <i32 1, i32 3> ; result lanes: %vec1[1], %vec2[1]
+  %res = select <2 x i1> <i1 0, i1 1>, <2 x double> %shuf, <2 x double> zeroinitializer ; lane 1 takes %shuf, lane 0 is zero; mask bits 0b10 = 2 (expected movb immediate)
+  ret <2 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; unmasked; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $9, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 0,3 take %shuf, other lanes keep %vec3; mask bits 0b1001 = 9 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $9, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 0,3 take %shuf, other lanes are zero; mask bits 0b1001 = 9 (expected movb immediate)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 1,2,3 take %shuf, lane 0 keeps %vec3; mask bits 0b1110 = 14 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $14, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 1,2,3 take %shuf, lane 0 is zero; mask bits 0b1110 = 14 (expected movb immediate)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 1,2 take %shuf, other lanes keep %vec3; mask bits 0b0110 = 6 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $6, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 1,2 take %shuf, other lanes are zero; mask bits 0b0110 = 6 (expected movb immediate)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; unmasked; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> %vec3 ; lane 0 takes %shuf, other lanes keep %vec3; mask bits 0b0001 = 1 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $1, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 0, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer ; lane 0 takes %shuf, other lanes are zero; mask bits 0b0001 = 1 (expected movb immediate)
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; unmasked; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 0,1,3 take %shuf, lane 2 keeps %vec3; mask bits 0b1011 = 11 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $11, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 0,1,3 take %shuf, lane 2 is zero; mask bits 0b1011 = 11 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 2,3 take %shuf, other lanes keep %vec3; mask bits 0b1100 = 12 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $12, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 2,3 take %shuf, other lanes are zero; mask bits 0b1100 = 12 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 0,2,3 take %shuf, lane 1 keeps %vec3; mask bits 0b1101 = 13 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $13, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 0,2,3 take %shuf, lane 1 is zero; mask bits 0b1101 = 13 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; unmasked; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  ret <4 x double> %res
+}
+define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3) {
+; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 # sched: [1:1.00]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %vec3 ; lanes 1,3 take %shuf, other lanes keep %vec3; mask bits 0b1010 = 10 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) {
+; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $10, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <4 x double>, <4 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> ; result lanes: %vec1[1], %vec2[1], %vec1[3], %vec2[3]
+  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> zeroinitializer ; lanes 1,3 take %shuf, other lanes are zero; mask bits 0b1010 = 10 (expected movb immediate)
+  ret <4 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; unmasked; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-27, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 0,2,5,6,7 take %shuf, other lanes keep %vec3; mask bits 0b11100101 = 229, i.e. -27 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-27, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 0,2,5,6,7 take %shuf, other lanes are zero; mask bits 0b11100101 = 229, i.e. -27 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-21, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 0,1,3,5,6,7 take %shuf, other lanes keep %vec3; mask bits 0b11101011 = 235, i.e. -21 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-21, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 0,1,3,5,6,7 take %shuf, other lanes are zero; mask bits 0b11101011 = 235, i.e. -21 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-118, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 1,3,7 take %shuf, other lanes keep %vec3; mask bits 0b10001010 = 138, i.e. -118 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-118, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 1,3,7 take %shuf, other lanes are zero; mask bits 0b10001010 = 138, i.e. -118 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; unmasked; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $100, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    vmovapd %zmm2, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 2,5,6 take %shuf, other lanes keep %vec3; mask bits 0b01100100 = 100 (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $100, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 2,5,6 take %shuf, other lanes are zero; mask bits 0b01100100 = 100 (expected movb immediate)
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; unmasked; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 2,4,5,7 take %shuf, other lanes keep %vec3; mask bits 0b10110100 = 180, i.e. -76 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-76, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 2,4,5,7 take %shuf, other lanes are zero; mask bits 0b10110100 = 180, i.e. -76 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $71, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 0,1,2,6 take %shuf, other lanes keep %vec3; mask bits 0b01000111 = 71 (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $71, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 0,1,2,6 take %shuf, other lanes are zero; mask bits 0b01000111 = 71 (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-49, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 0,1,2,3,6,7 take %shuf, other lanes keep %vec3; mask bits 0b11001111 = 207, i.e. -49 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-49, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 0, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 0,1,2,3,6,7 take %shuf, other lanes are zero; mask bits 0b11001111 = 207, i.e. -49 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; unmasked; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  ret <8 x double> %res
+}
+define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3) {
+; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-40, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    vmovapd %zmm1, %zmm0
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> %vec3 ; lanes 3,4,6,7 take %shuf, other lanes keep %vec3; mask bits 0b11011000 = 216, i.e. -40 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+
+define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) {
+; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb $-40, %al # sched: [1:0.25]
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7]
+; CHECK-NEXT:    retq # sched: [2:1.00]
+  %vec2 = load <8 x double>, <8 x double>* %vec2p ; second shuffle operand comes from memory (mem[...] in the expected asm)
+  %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> ; result lanes 2i / 2i+1 are %vec1[2i+1] / %vec2[2i+1]
+  %res = select <8 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1>, <8 x double> %shuf, <8 x double> zeroinitializer ; lanes 3,4,6,7 take %shuf, other lanes are zero; mask bits 0b11011000 = 216, i.e. -40 as a signed byte (expected movb immediate)
+  ret <8 x double> %res
+}
+

Propchange: llvm/trunk/test/CodeGen/X86/avx512-shuffle-schedule.ll
------------------------------------------------------------------------------
    svn:executable = *




More information about the llvm-commits mailing list