[llvm] r307500 - [X86][AVX512] Regenerate AVX512VL comparison tests.

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Sun Jul 9 08:47:43 PDT 2017


Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=307500&r1=307499&r2=307500&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Sun Jul  9 08:47:43 2017
@@ -1,13 +1,124 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefix=NoVLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=VLX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=NoVLX
 
 define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi0:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi1:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi2:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi3:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi4:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi5:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi6:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi7:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -18,11 +129,122 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi8:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi9:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi10:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi11:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi12:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi13:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi14:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi15:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -34,12 +256,124 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi16:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi17:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi18:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi19:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi20:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi21:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi22:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi23:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -52,12 +386,124 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi24:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi25:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi26:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi27:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi28:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi29:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi30:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi31:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -72,11 +518,127 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi32:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi33:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi34:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi35:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi36:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi37:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi38:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi39:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -87,11 +649,127 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi40:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi41:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi42:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi43:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi44:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi45:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi46:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi47:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -103,12 +781,129 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi48:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi49:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi50:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi51:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi52:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi53:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi54:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi55:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -121,12 +916,129 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi56:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi57:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi58:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi59:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi60:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi61:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi62:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi63:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -141,12 +1053,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi64:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi65:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi66:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -157,12 +1103,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqb (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqb (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi67:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi68:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi69:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -174,13 +1154,56 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi70:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi71:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi72:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -193,13 +1216,56 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi73:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi74:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi75:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -214,11 +1280,24 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -229,11 +1308,24 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -245,12 +1337,26 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -263,12 +1369,26 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -283,11 +1403,72 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi76:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi77:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi78:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -298,11 +1479,72 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi79:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi80:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi81:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -314,12 +1556,74 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi82:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi83:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi84:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -332,12 +1636,74 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi85:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi86:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi87:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -352,11 +1718,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi88:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi89:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi90:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -367,11 +1799,77 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi91:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi92:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi93:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -383,12 +1881,79 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi94:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi95:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi96:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -401,12 +1966,79 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi97:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi98:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi99:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -421,12 +2053,123 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi100:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi101:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi102:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi103:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi104:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi105:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi106:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi107:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -437,12 +2180,123 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi108:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi109:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi110:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi111:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi112:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi113:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi114:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi115:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -454,13 +2308,125 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi116:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi117:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi118:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi119:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi120:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi121:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi122:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi123:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -473,13 +2439,125 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi124:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi125:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi126:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi127:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi128:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi129:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi130:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi131:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -494,12 +2572,128 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi132:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi133:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi134:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi135:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi136:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi137:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi138:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi139:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -510,12 +2704,128 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi140:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi141:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi142:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi143:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi144:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi145:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi146:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi147:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -527,13 +2837,130 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi148:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi149:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi150:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi151:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi152:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi153:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi154:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi155:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -546,13 +2973,130 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi156:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi157:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi158:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi159:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi160:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi161:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi162:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi163:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -567,12 +3111,348 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi164:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi165:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi166:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -583,12 +3463,263 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqw (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqw (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi167:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi168:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi169:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -600,13 +3731,358 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi170:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi171:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi172:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT:    vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT:    vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -619,13 +4095,273 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi173:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi174:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi175:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT:    vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT:    vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -640,11 +4376,51 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -655,11 +4431,51 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -671,12 +4487,70 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -690,12 +4564,70 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -711,11 +4643,52 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -728,12 +4701,71 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -750,11 +4782,50 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -765,11 +4836,50 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -781,12 +4891,69 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -800,12 +4967,69 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -821,11 +5045,51 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -838,12 +5102,70 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -860,11 +5182,39 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi176:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi177:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi178:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -875,11 +5225,39 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi179:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi180:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi181:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -891,12 +5269,58 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi182:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi183:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi184:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -910,12 +5334,58 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi185:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi186:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi187:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -931,11 +5401,40 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi188:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi189:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi190:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -948,12 +5447,59 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi191:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi192:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi193:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -970,11 +5516,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi194:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi195:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi196:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -985,11 +5566,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi197:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi198:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi199:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1001,12 +5617,65 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi200:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi201:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi202:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -1020,12 +5689,65 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi203:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi204:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi205:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1041,11 +5763,47 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi206:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi207:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi208:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -1058,12 +5816,66 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi209:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi210:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi211:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -1080,21 +5892,23 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1106,21 +5920,23 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1133,23 +5949,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1163,23 +5981,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1195,21 +6015,23 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1223,23 +6045,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v16i1_mask_mem_b:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -1256,12 +6080,72 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi212:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi213:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi214:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1272,12 +6156,72 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi215:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi216:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi217:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -1289,13 +6233,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi218:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi219:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi220:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1308,13 +6314,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi221:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi222:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi223:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -1329,12 +6397,72 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi224:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi225:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi226:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -1347,13 +6475,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi227:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi228:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi229:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k0, %k1, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -1369,12 +6559,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi230:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi231:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi232:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1385,12 +6640,77 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi233:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi234:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi235:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -1402,13 +6722,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi236:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi237:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi238:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -1421,13 +6808,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi239:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi240:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi241:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -1442,12 +6896,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi242:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi243:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi244:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -1460,13 +6979,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi245:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi246:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi247:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k0, %k1, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -1482,12 +7068,120 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi248:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi249:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi250:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi251:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi252:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi253:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi254:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi255:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1498,12 +7192,120 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi256:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi257:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi258:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi259:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi260:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi261:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi262:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi263:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -1515,13 +7317,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi264:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi265:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi266:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi267:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi268:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi269:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi270:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi271:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1534,13 +7445,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi272:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi273:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi274:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi275:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi276:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi277:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi278:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi279:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -1555,12 +7575,120 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi280:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi281:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi282:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi283:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi284:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi285:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi286:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi287:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -1573,13 +7701,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi288:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi289:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi290:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi291:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi292:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi293:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi294:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi295:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -1595,12 +7832,125 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi296:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi297:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi298:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi299:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi300:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi301:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi302:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi303:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1611,12 +7961,125 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi304:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi305:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi306:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi307:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi308:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi309:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi310:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi311:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -1628,13 +8091,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi312:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi313:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi314:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi315:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi316:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi317:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi318:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi319:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -1647,13 +8224,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi320:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi321:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi322:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi323:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi324:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi325:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi326:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi327:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -1668,12 +8359,125 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi328:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi329:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi330:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi331:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi332:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi333:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi334:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi335:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -1686,13 +8490,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi336:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi337:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi338:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi339:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi340:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi341:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi342:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi343:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -1708,12 +8626,23 @@ entry:
 
 
 define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1724,12 +8653,23 @@ entry:
 }
 
 define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1741,13 +8681,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1761,13 +8722,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1783,12 +8765,24 @@ entry:
 
 
 define zeroext i4 @test_vpcmpeqq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -1801,13 +8795,35 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -1824,11 +8840,35 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1839,11 +8879,35 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1855,12 +8919,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1874,12 +8972,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1895,11 +9027,36 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -1912,12 +9069,47 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -1934,11 +9126,34 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1949,11 +9164,34 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -1965,12 +9203,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -1984,12 +9255,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -2005,11 +9309,35 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2022,12 +9350,46 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2044,11 +9406,39 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi344:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi345:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi346:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2059,11 +9449,39 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi347:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi348:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi349:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -2075,12 +9493,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi350:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi351:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi352:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2094,12 +9550,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi353:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi354:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi355:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -2115,11 +9609,40 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi356:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi357:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi358:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2132,12 +9655,51 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi359:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi360:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi361:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2154,11 +9716,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi362:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi363:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi364:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2169,11 +9766,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi365:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi366:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi367:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -2185,12 +9817,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi368:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi369:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi370:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -2204,12 +9881,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi371:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi372:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi373:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -2225,11 +9947,47 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi374:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi375:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi376:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2242,12 +10000,58 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi377:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi378:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi379:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -2264,12 +10068,53 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2280,12 +10125,53 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2297,13 +10183,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2317,13 +10262,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2339,12 +10343,54 @@ entry:
 
 
 define zeroext i8 @test_vpcmpeqq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2357,13 +10403,73 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2380,12 +10486,52 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2396,12 +10542,52 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2413,13 +10599,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2433,13 +10677,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2455,12 +10757,53 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2473,13 +10816,72 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2496,12 +10898,41 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi380:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi381:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi382:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2512,12 +10943,41 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi383:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi384:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi385:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2529,13 +10989,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi386:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi387:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi388:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2549,13 +11056,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi389:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi390:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi391:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2571,12 +11125,42 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi392:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi393:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi394:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2589,13 +11173,61 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi395:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi396:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi397:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2612,12 +11244,48 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi398:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi399:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi400:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2628,12 +11296,48 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi401:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi402:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi403:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2645,13 +11349,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi404:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi405:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi406:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -2665,13 +11423,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi407:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi408:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi409:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -2687,12 +11499,49 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi410:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi411:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi412:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2705,13 +11554,68 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi413:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi414:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi415:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -2728,12 +11632,20 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2744,12 +11656,20 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -2761,13 +11681,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2780,13 +11709,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -2801,12 +11739,20 @@ entry:
 
 
 define zeroext i16 @test_vpcmpeqq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -2819,13 +11765,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -2841,12 +11796,70 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi416:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi417:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi418:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2857,12 +11870,70 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi419:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi420:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi421:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -2874,13 +11945,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi422:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi423:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi424:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2893,13 +12023,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi425:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi426:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi427:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -2914,12 +12103,70 @@ entry:
 
 
 define zeroext i32 @test_vpcmpeqq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi428:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi429:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi430:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -2932,13 +12179,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi431:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi432:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi433:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -2954,12 +12260,75 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi434:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi435:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi436:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -2970,12 +12339,75 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi437:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi438:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi439:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -2987,13 +12419,77 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi440:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi441:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi442:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -3006,13 +12502,77 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi443:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi444:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi445:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -3027,12 +12587,75 @@ entry:
 
 
 define zeroext i64 @test_vpcmpeqq_v8i1_v64i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi446:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi447:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi448:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -3045,13 +12668,77 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi449:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi450:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi451:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -3067,11 +12754,122 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi452:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi453:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi454:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi455:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi456:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi457:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi458:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi459:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3082,11 +12880,122 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtb_v16i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi460:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi461:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi462:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi463:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi464:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi465:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi466:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi467:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3098,12 +13007,124 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi468:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi469:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi470:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi471:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi472:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi473:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi474:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi475:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3116,12 +13137,124 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi476:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi477:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi478:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi479:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi480:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi481:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi482:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi483:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3136,11 +13269,127 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi484:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi485:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi486:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi487:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi488:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi489:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi490:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi491:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3151,11 +13400,127 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtb_v16i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi492:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi493:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi494:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi495:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi496:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi497:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi498:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi499:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3167,12 +13532,129 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask(i16 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi500:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi501:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi502:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi503:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi504:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi505:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi506:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi507:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %1 = bitcast <2 x i64> %__b to <16 x i8>
@@ -3185,12 +13667,129 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem(i16 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi508:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi509:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi510:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi511:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi512:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi513:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi514:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi515:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <16 x i8>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3205,12 +13804,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi516:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi517:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi518:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3221,12 +13854,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtb_v32i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtb (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtb (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi519:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi520:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi521:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3238,13 +13905,56 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask(i32 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi522:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi523:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi524:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; NoVLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %1 = bitcast <4 x i64> %__b to <32 x i8>
@@ -3257,13 +13967,56 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem(i32 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtb (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi525:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi526:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi527:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
+; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; NoVLX-NEXT:    vpcmpgtb (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; NoVLX-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <32 x i8>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3278,11 +14031,24 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3293,11 +14059,24 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtw_v8i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3309,12 +14088,26 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3327,12 +14120,26 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kunpckbw %k0, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3347,11 +14154,72 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi528:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi529:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi530:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3362,11 +14230,72 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtw_v8i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi531:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi532:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi533:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3378,12 +14307,74 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi534:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi535:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi536:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3396,12 +14387,74 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi537:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi538:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi539:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3416,11 +14469,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi540:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi541:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi542:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3431,11 +14550,77 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtw_v8i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi543:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi544:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi545:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3447,12 +14632,79 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi546:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi547:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi548:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %1 = bitcast <2 x i64> %__b to <8 x i16>
@@ -3465,12 +14717,79 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi549:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi550:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi551:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <8 x i16>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3485,12 +14804,123 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi552:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi553:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi554:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi555:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi556:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi557:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi558:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi559:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3501,12 +14931,123 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtw_v16i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi560:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi561:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi562:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi563:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi564:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi565:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi566:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi567:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3518,13 +15059,125 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi568:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi569:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi570:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi571:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi572:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi573:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi574:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi575:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3537,13 +15190,125 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi576:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi577:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi578:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi579:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi580:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi581:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi582:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi583:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3558,12 +15323,128 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi584:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi585:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi586:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi587:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi588:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi589:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi590:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi591:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3574,12 +15455,128 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtw_v16i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi592:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi593:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi594:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi595:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi596:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi597:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi598:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi599:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3591,13 +15588,130 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask(i16 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi600:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi601:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi602:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi603:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi604:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi605:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi606:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi607:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %1 = bitcast <4 x i64> %__b to <16 x i16>
@@ -3610,13 +15724,130 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem(i16 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi608:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi609:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi610:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi611:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi612:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi613:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi614:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi615:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <16 x i16>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -3631,12 +15862,348 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi616:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi617:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi618:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3647,12 +16214,263 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtw_v32i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtw (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtw (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi619:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi620:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi621:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw 32(%rdi), %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -3664,13 +16482,358 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask(i32 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi622:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi623:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi624:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm8
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm4
+; NoVLX-NEXT:    vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm8, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm3
+; NoVLX-NEXT:    vpmovsxwd %ymm4, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpand %xmm6, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm0, %xmm3, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %1 = bitcast <8 x i64> %__b to <32 x i16>
@@ -3683,13 +16846,273 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem(i32 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtw (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi625:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi626:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi627:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $96, %rsp
+; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    movq %rax, %rdx
+; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    shrq $32, %rdx
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm2
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm4
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm2
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm5, %ymm3
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm4, %ymm4
+; NoVLX-NEXT:    vpmovsxwd %ymm4, %zmm4
+; NoVLX-NEXT:    vpslld $31, %zmm4, %zmm4
+; NoVLX-NEXT:    vptestmd %zmm4, %zmm4, %k0
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpand %xmm2, %xmm4, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %ecx
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    shlq $32, %rax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <32 x i16>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -3704,11 +17127,51 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3719,11 +17182,51 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3735,12 +17238,70 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3754,12 +17315,70 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3775,11 +17394,52 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtd_v4i1_v8i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -3792,12 +17452,71 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -3814,11 +17533,50 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3829,11 +17587,50 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3845,12 +17642,69 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3864,12 +17718,69 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3885,11 +17796,51 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtd_v4i1_v16i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -3902,12 +17853,70 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -3924,11 +17933,39 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi628:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi629:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi630:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3939,11 +17976,39 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi631:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi632:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi633:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3955,12 +18020,58 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi634:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi635:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi636:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -3974,12 +18085,58 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi637:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi638:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi639:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -3995,11 +18152,40 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v4i1_v32i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi640:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi641:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi642:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -4012,12 +18198,59 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi643:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi644:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi645:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -4034,11 +18267,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi646:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi647:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi648:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4049,11 +18317,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi649:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi650:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi651:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4065,12 +18368,65 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi652:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi653:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi654:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %1 = bitcast <2 x i64> %__b to <4 x i32>
@@ -4084,12 +18440,65 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi655:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi656:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi657:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4105,11 +18514,47 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v4i1_v64i1_mask_mem_b(<2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to4}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi658:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi659:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi660:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -4122,12 +18567,66 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to4}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi661:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi662:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi663:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <4 x i32>
   %load = load i32, i32* %__b
@@ -4144,21 +18643,23 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4170,21 +18671,23 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4197,23 +18700,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4227,23 +18732,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4259,21 +18766,23 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtd_v8i1_v16i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4287,23 +18796,25 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v16i1_mask_mem_b:
-; NoVLX:       ## BB#0: ## %entry
-; NoVLX-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
 ; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
@@ -4320,28 +18831,148 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-entry:
-  %0 = bitcast <4 x i64> %__a to <8 x i32>
-  %1 = bitcast <4 x i64> %__b to <8 x i32>
-  %2 = icmp sgt <8 x i32> %0, %1
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi664:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi665:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi666:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
+entry:
+  %0 = bitcast <4 x i64> %__a to <8 x i32>
+  %1 = bitcast <4 x i64> %__b to <8 x i32>
+  %2 = icmp sgt <8 x i32> %0, %1
   %3 = shufflevector <8 x i1> %2, <8 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %4 = bitcast <32 x i1> %3 to i32
   ret i32 %4
 }
 
 define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi667:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi668:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi669:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -4353,13 +18984,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi670:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi671:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi672:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4372,13 +19065,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi673:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi674:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi675:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -4393,12 +19148,72 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v8i1_v32i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi676:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi677:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi678:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -4411,13 +19226,75 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi679:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi680:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi681:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k0, %k1, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -4433,12 +19310,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi682:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi683:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi684:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4449,12 +19391,77 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi685:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi686:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi687:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -4466,13 +19473,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi688:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi689:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi690:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4485,13 +19559,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi691:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi692:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi693:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k1, %k0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -4506,12 +19647,77 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v8i1_v64i1_mask_mem_b(<4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to8}, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi694:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi695:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi696:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -4524,13 +19730,80 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to8}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi697:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi698:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi699:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    kandw %k0, %k1, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %load = load i32, i32* %__b
@@ -4546,12 +19819,120 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi700:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi701:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi702:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi703:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi704:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi705:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi706:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi707:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4562,12 +19943,120 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi708:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi709:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi710:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi711:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi712:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi713:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi714:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi715:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -4579,13 +20068,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi716:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi717:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi718:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi719:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi720:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi721:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi722:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi723:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4598,13 +20196,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi724:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi725:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi726:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi727:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi728:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi729:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi730:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi731:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -4619,12 +20326,120 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtd_v16i1_v32i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi732:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi733:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi734:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi735:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi736:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi737:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi738:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi739:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -4637,13 +20452,122 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi740:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi741:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi742:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:  .Lcfi743:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi744:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi745:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi746:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi747:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -4659,12 +20583,125 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi748:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi749:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi750:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi751:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi752:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi753:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi754:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi755:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4675,12 +20712,125 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi756:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi757:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi758:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi759:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi760:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi761:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi762:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi763:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -4692,13 +20842,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask(i16 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi764:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi765:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi766:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi767:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi768:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi769:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi770:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi771:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %1 = bitcast <8 x i64> %__b to <16 x i32>
@@ -4711,13 +20975,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem(i16 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi772:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi773:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi774:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi775:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi776:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi777:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi778:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi779:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -4732,12 +21110,125 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtd_v16i1_v64i1_mask_mem_b(<8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi780:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi781:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi782:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi783:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi784:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi785:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi786:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi787:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -4750,13 +21241,127 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b(i16 zeroext %__u, <8 x i64> %__a, i32* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi788:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi789:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi790:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:  .Lcfi791:
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:  .Lcfi792:
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:  .Lcfi793:
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:  .Lcfi794:
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:  .Lcfi795:
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <16 x i32>
   %load = load i32, i32* %__b
@@ -4772,12 +21377,23 @@ entry:
 
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4788,12 +21404,23 @@ entry:
 }
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4805,13 +21432,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4825,13 +21473,34 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4847,12 +21516,24 @@ entry:
 
 
 define zeroext i4 @test_vpcmpsgtq_v2i1_v4i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -4865,13 +21546,35 @@ entry:
 }
 
 define zeroext i4 @test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovb %k0, -{{[0-9]+}}(%rsp)
+; VLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -4888,11 +21591,35 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4903,11 +21630,35 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4919,12 +21670,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -4938,12 +21723,46 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -4959,11 +21778,36 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtq_v2i1_v8i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -4976,12 +21820,47 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -4998,11 +21877,34 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5013,11 +21915,34 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5029,12 +21954,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5048,12 +22006,45 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5069,11 +22060,35 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v2i1_v16i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5086,12 +22101,46 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5108,11 +22157,39 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi796:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi797:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi798:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5123,11 +22200,39 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi799:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi800:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi801:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5139,12 +22244,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi802:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi803:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi804:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5158,12 +22301,50 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi805:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi806:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi807:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5179,11 +22360,40 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v2i1_v32i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi808:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi809:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi810:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5196,12 +22406,51 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi811:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi812:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi813:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5218,11 +22467,46 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask(<2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi814:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi815:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi816:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5233,11 +22517,46 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem(<2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi817:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi818:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi819:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5249,12 +22568,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask(i8 zeroext %__u, <2 x i64> %__a, <2 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi820:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi821:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi822:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %1 = bitcast <2 x i64> %__b to <2 x i64>
@@ -5268,12 +22632,57 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem(i8 zeroext %__u, <2 x i64> %__a, <2 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi823:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi824:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi825:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load <2 x i64>, <2 x i64>* %__b
@@ -5289,11 +22698,47 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtq_v2i1_v64i1_mask_mem_b(<2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to2}, %xmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi826:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi827:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi828:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5306,12 +22751,58 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b(i8 zeroext %__u, <2 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to2}, %xmm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi829:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi830:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi831:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <2 x i64> %__a to <2 x i64>
   %load = load i64, i64* %__b
@@ -5328,12 +22819,53 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5344,12 +22876,53 @@ entry:
 }
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5361,13 +22934,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5381,13 +23013,72 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5403,12 +23094,54 @@ entry:
 
 
 define zeroext i8 @test_vpcmpsgtq_v4i1_v8i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5421,13 +23154,73 @@ entry:
 }
 
 define zeroext i8 @test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $7, %k0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7]
+; NoVLX-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpsllq $63, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5444,12 +23237,52 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5460,12 +23293,52 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5477,13 +23350,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5497,13 +23428,71 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5519,12 +23508,53 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v4i1_v16i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5537,13 +23567,72 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $0, %xmm0, %eax
+; NoVLX-NEXT:    andl $1, %eax
+; NoVLX-NEXT:    kmovw %eax, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $1, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $1, %k1, %k1
+; NoVLX-NEXT:    korw %k0, %k1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,16,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm2, %zmm1, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; NoVLX-NEXT:    vpextrb $12, %xmm0, %eax
+; NoVLX-NEXT:    kmovw %eax, %k1
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,16,4,5,6,7,8,9,10,11,12,13,14,15]
+; NoVLX-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5560,12 +23649,41 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi832:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi833:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi834:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5576,12 +23694,41 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi835:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi836:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi837:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5593,13 +23740,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi838:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi839:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi840:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5613,13 +23807,60 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi841:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi842:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi843:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5635,12 +23876,42 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v4i1_v32i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi844:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi845:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi846:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5653,13 +23924,61 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi847:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi848:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi849:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5676,12 +23995,48 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask(<4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi850:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi851:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi852:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5692,12 +24047,48 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem(<4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi853:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi854:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi855:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5709,13 +24100,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask(i8 zeroext %__u, <4 x i64> %__a, <4 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi856:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi857:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi858:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %1 = bitcast <4 x i64> %__b to <4 x i64>
@@ -5729,13 +24174,67 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem(i8 zeroext %__u, <4 x i64> %__a, <4 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi859:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi860:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi861:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load <4 x i64>, <4 x i64>* %__b
@@ -5751,12 +24250,49 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtq_v4i1_v64i1_mask_mem_b(<4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to4}, %ymm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi862:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi863:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi864:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5769,13 +24305,68 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b(i8 zeroext %__u, <4 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to4}, %ymm0, %k0 {%k1}
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi865:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi866:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi867:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k0
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
+; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
+; NoVLX-NEXT:    kshiftrw $15, %k3, %k3
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kmovw %k3, %ecx
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k2, %eax
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <4 x i64>
   %load = load i64, i64* %__b
@@ -5792,12 +24383,20 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5808,12 +24407,20 @@ entry:
 }
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -5825,13 +24432,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5844,13 +24460,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -5865,12 +24490,20 @@ entry:
 
 
 define zeroext i16 @test_vpcmpsgtq_v8i1_v16i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -5883,13 +24516,22 @@ entry:
 }
 
 define zeroext i16 @test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v16i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -5905,12 +24547,70 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi868:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi869:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi870:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5921,12 +24621,70 @@ entry:
 }
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi871:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi872:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi873:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -5938,13 +24696,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi874:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi875:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi876:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -5957,13 +24774,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem(i8 zeroext %__u, <8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi877:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi878:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi879:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -5978,12 +24854,70 @@ entry:
 
 
 define zeroext i32 @test_vpcmpsgtq_v8i1_v32i1_mask_mem_b(<8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi880:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi881:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi882:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -5996,13 +24930,72 @@ entry:
 }
 
 define zeroext i32 @test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b(i8 zeroext %__u, <8 x i64> %__a, i64* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovd %k0, %eax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    kmovd %edi, %k1
+; VLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi883:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi884:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi885:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load i64, i64* %__b
@@ -6018,12 +25011,75 @@ entry:
 
 
 define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask(<8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi886:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi887:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi888:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %1 = bitcast <8 x i64> %__b to <8 x i64>
@@ -6034,12 +25090,75 @@ entry:
 }
 
 define zeroext i64 @test_vpcmpsgtq_v8i1_v64i1_mask_mem(<8 x i64> %__a, <8 x i64>* %__b) local_unnamed_addr {
-; CHECK-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
+; VLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; VLX:       # BB#0: # %entry
+; VLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; VLX-NEXT:    kmovq %k0, %rax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
+; NoVLX:       # BB#0: # %entry
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:  .Lcfi889:
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:  .Lcfi890:
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:  .Lcfi891:
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $64, %rsp
+; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; NoVLX-NEXT:    shlq $32, %rcx
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    orq %rcx, %rax
+; NoVLX-NEXT:    movq %rbp, %rsp
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i64> %__a to <8 x i64>
   %load = load <8 x i64>, <8 x i64>* %__b
@@ -6051,13 +25170,77 @@ entry:
 }
 
 define zeroext i64 @test_masked_vpcmpsgtq_v8i1_v64i1_mask(i8 zeroext %__u, <8 x i64> %__a, <8 x i64> %__b) local_unnamed_addr {
-; CHECK-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
-; CHECK:       ## BB#0: ## %entry
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovq %k0, %rax
-; CHECK-NEXT:    vzeroupper

[... 30710 lines stripped ...]



More information about the llvm-commits mailing list