[llvm] r323201 - [X86] Legalize v32i1 without BWI via splitting to v16i1 rather than the default of promoting to v32i8.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 23 06:25:39 PST 2018
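
An illustrative sketch of the kind of IR these tests exercise (my own example, not taken from the commit; the function name is hypothetical). Without BWI, the <32 x i1> compare result is now legalized by splitting it into two v16i1 halves rather than promoting it to v32i8, which is what eliminates the frame setup and the k-register spills to the stack visible in the updated checks below:

    define i32 @cmp_v32i8_to_mask(<32 x i8> %a, <32 x i8> %b) {
    entry:
      ; 32-wide byte compare produces a <32 x i1> mask
      %cmp = icmp eq <32 x i8> %a, %b
      ; pack the mask into a 32-bit scalar, as the tests' zeroext returns do
      %bits = bitcast <32 x i1> %cmp to i32
      ret i32 %bits
    }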


Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Tue Jan 23 06:25:39 2018
@@ -11,22 +11,10 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -47,22 +35,10 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -85,25 +61,11 @@ define zeroext i32 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -127,25 +89,11 @@ define zeroext i32 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -170,27 +118,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -211,27 +143,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -254,30 +170,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -301,30 +199,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -350,30 +230,16 @@ define zeroext i64 @test_vpcmpeqb_v32i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -395,30 +261,16 @@ define zeroext i64 @test_vpcmpeqb_v32i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -442,41 +294,19 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -501,41 +331,19 @@ define zeroext i64 @test_masked_vpcmpeqb
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -675,49 +483,10 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -738,49 +507,10 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -803,50 +533,11 @@ define zeroext i32 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -870,50 +561,11 @@ define zeroext i32 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -938,54 +590,11 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1006,54 +615,11 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1076,55 +642,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1148,55 +671,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1222,22 +702,10 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1259,22 +727,10 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1298,25 +754,11 @@ define zeroext i32 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1341,25 +783,11 @@ define zeroext i32 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1385,27 +813,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1427,27 +839,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1471,30 +867,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1519,30 +897,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1568,30 +928,43 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -1599,109 +972,109 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -1709,7 +1082,11 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -1717,51 +1094,20 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1783,67 +1129,60 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -1851,7 +1190,7 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -1861,37 +1200,30 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw 32(%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpcmpeqw 32(%rdi), %ymm1, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -1915,51 +1247,23 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -1967,109 +1271,109 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -2077,17 +1381,7 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -2095,35 +1389,47 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2148,68 +1454,60 @@ define zeroext i64 @test_masked_vpcmpeqw
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm1
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -2217,7 +1515,7 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -2227,51 +1525,33 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2678,37 +1958,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2729,37 +1984,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2782,38 +2012,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2838,38 +2043,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2895,37 +2075,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -2949,38 +2104,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3007,42 +2137,13 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3063,42 +2164,13 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3121,43 +2193,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3182,43 +2225,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3244,42 +2258,13 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3303,43 +2288,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3561,49 +2517,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3625,49 +2544,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3691,50 +2573,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3759,50 +2604,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3828,49 +2636,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3895,50 +2666,13 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -3965,54 +2699,13 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4034,54 +2727,13 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4105,57 +2757,16 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
-; NoVLX-NEXT:    vzeroupper
-; NoVLX-NEXT:    retq
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
+; NoVLX-NEXT:    vzeroupper
+; NoVLX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__a to <8 x i32>
   %1 = bitcast <4 x i64> %__b to <8 x i32>
@@ -4178,55 +2789,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4252,54 +2822,13 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4324,55 +2853,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4399,22 +2887,8 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4436,22 +2910,8 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4475,23 +2935,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4516,23 +2962,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4558,22 +2990,8 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4598,23 +3016,9 @@ define zeroext i32 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4641,27 +3045,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4683,27 +3069,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4727,28 +3095,10 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4773,28 +3123,10 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4820,27 +3152,9 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ;
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -4865,28 +3179,10 @@ define zeroext i64 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5485,31 +3781,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5530,31 +3807,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5577,32 +3835,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5627,32 +3866,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5678,31 +3898,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5726,32 +3927,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5778,36 +3960,13 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5828,36 +3987,13 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5880,37 +4016,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5935,37 +4048,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -5991,36 +4081,13 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6044,37 +4111,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6496,37 +4540,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6548,37 +4567,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6602,38 +4596,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6659,38 +4628,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6717,37 +4661,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6772,38 +4691,13 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6831,42 +4725,13 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6888,42 +4753,13 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -6947,43 +4783,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7009,43 +4816,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7072,42 +4850,13 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7132,43 +4881,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7366,47 +5086,8 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7428,47 +5109,8 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7492,48 +5134,9 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7558,48 +5161,9 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7625,47 +5189,8 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7690,48 +5215,9 @@ define zeroext i32 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7758,52 +5244,9 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7825,52 +5268,9 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7894,53 +5294,10 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -7965,53 +5322,10 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8037,52 +5351,9 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ;
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8107,53 +5378,10 @@ define zeroext i64 @test_masked_vpcmpeqq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8179,22 +5407,10 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8215,22 +5431,10 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8253,25 +5457,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8295,25 +5485,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8338,27 +5514,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8379,27 +5539,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8422,30 +5566,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8469,30 +5595,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8518,30 +5626,16 @@ define zeroext i64 @test_vpcmpsgtb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8563,30 +5657,16 @@ define zeroext i64 @test_vpcmpsgtb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8610,41 +5690,19 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8669,41 +5727,19 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8843,49 +5879,10 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8906,49 +5903,10 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -8971,50 +5929,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9038,50 +5957,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9106,54 +5986,11 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9174,54 +6011,11 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9244,55 +6038,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9316,55 +6067,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9390,22 +6098,10 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9427,22 +6123,10 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9466,25 +6150,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9509,25 +6179,11 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9553,27 +6209,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9595,27 +6235,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9639,30 +6263,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9687,30 +6293,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9736,30 +6324,63 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -9769,17 +6390,17 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
@@ -9787,69 +6408,69 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -9858,26 +6479,10 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -9885,51 +6490,20 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -9951,67 +6525,60 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -10019,7 +6586,7 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -10029,37 +6596,30 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw 32(%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpcmpgtw 32(%rdi), %ymm1, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10083,51 +6643,23 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -10135,109 +6667,109 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -10245,17 +6777,7 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -10263,35 +6785,47 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10316,68 +6850,60 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm1
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -10385,7 +6911,7 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -10395,51 +6921,33 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10846,37 +7354,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10897,37 +7380,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -10950,38 +7408,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11006,38 +7439,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11063,37 +7471,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11117,38 +7500,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11175,42 +7533,13 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11231,42 +7560,13 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11289,43 +7589,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11350,43 +7621,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11412,42 +7654,13 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11471,43 +7684,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11729,49 +7913,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11793,49 +7940,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11859,50 +7969,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11927,50 +8000,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -11996,49 +8032,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12063,50 +8062,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12133,54 +8095,13 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12202,54 +8123,13 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12273,55 +8153,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12346,55 +8185,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12420,54 +8218,13 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12492,55 +8249,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12567,22 +8283,8 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12604,22 +8306,8 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12643,23 +8331,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12684,23 +8358,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12726,22 +8386,8 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12766,23 +8412,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12809,27 +8441,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12851,27 +8465,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12895,28 +8491,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12941,28 +8519,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -12988,27 +8548,9 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13033,28 +8575,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13653,31 +9177,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13698,31 +9203,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13745,32 +9231,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13795,32 +9262,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13846,31 +9294,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13894,32 +9323,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13946,36 +9356,13 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -13996,36 +9383,13 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14048,37 +9412,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14103,37 +9444,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14159,36 +9477,13 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14212,37 +9507,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14664,37 +9936,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14716,37 +9963,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14770,38 +9992,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14827,38 +10024,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14885,37 +10057,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14940,38 +10087,13 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -14999,42 +10121,13 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15056,42 +10149,13 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15115,43 +10179,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15177,43 +10212,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15240,42 +10246,13 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15300,43 +10277,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15534,47 +10482,8 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15596,47 +10505,8 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15660,48 +10530,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15726,48 +10557,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15793,47 +10585,8 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15858,48 +10611,9 @@ define zeroext i32 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15926,52 +10640,9 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -15993,52 +10664,9 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16062,53 +10690,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16133,53 +10718,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16205,52 +10747,9 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16275,53 +10774,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16347,25 +10803,13 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16386,26 +10830,14 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16428,28 +10860,14 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16473,13 +10891,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -16487,15 +10898,8 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16520,30 +10924,14 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16564,31 +10952,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16611,33 +10983,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16661,13 +11015,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -16675,20 +11022,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16714,34 +11050,20 @@ define zeroext i64 @test_vpcmpsgeb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16763,35 +11085,21 @@ define zeroext i64 @test_vpcmpsgeb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16815,43 +11123,23 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -16876,44 +11164,24 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vmovdqa (%rsi), %ymm3
-; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm3, %ymm0
-; NoVLX-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
+; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17067,52 +11335,13 @@ define zeroext i32 @test_vpcmpsgew_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17133,13 +11362,6 @@ define zeroext i32 @test_vpcmpsgew_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -17147,39 +11369,7 @@ define zeroext i32 @test_vpcmpsgew_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17202,13 +11392,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -17216,39 +11399,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17272,13 +11423,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -17287,39 +11431,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17344,57 +11456,14 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17415,13 +11484,6 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -17429,44 +11491,8 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17489,13 +11515,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -17503,44 +11522,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17564,13 +11547,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -17579,44 +11555,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17642,25 +11582,13 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17682,26 +11610,14 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17725,28 +11641,14 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17771,13 +11673,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -17785,15 +11680,8 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17819,30 +11707,14 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17864,31 +11736,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17912,33 +11768,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -17963,13 +11801,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -17977,20 +11808,9 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -18016,205 +11836,191 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -18236,67 +12042,60 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -18304,7 +12103,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -18314,44 +12113,37 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm2
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vmovdqa 32(%rdi), %ymm2
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -18375,51 +12167,23 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -18427,109 +12191,109 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -18537,17 +12301,7 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -18555,38 +12309,52 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
-; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -18611,68 +12379,60 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm1
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -18680,7 +12440,7 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -18690,56 +12450,40 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vmovdqa (%rsi), %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm3, %ymm0
-; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
+; NoVLX-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19146,37 +12890,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19197,37 +12916,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19250,38 +12944,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19306,38 +12975,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19363,37 +13007,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19417,38 +13036,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19475,42 +13069,13 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19531,42 +13096,13 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19589,43 +13125,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19650,43 +13157,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19712,42 +13190,13 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -19771,43 +13220,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20029,49 +13449,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20093,49 +13476,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20159,50 +13505,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20227,50 +13536,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20296,49 +13568,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20363,50 +13598,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20433,54 +13631,13 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20502,54 +13659,13 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20573,55 +13689,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20646,55 +13721,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20720,54 +13754,13 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20792,55 +13785,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20867,22 +13819,8 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20904,22 +13842,8 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20943,23 +13867,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -20984,23 +13894,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21026,22 +13922,8 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpnltd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpnltd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21066,23 +13948,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21109,27 +13977,9 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21151,27 +14001,9 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21195,28 +14027,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21241,28 +14055,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21288,27 +14084,9 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpnltd (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpnltd (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21333,28 +14111,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpnltd (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21953,31 +14713,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -21998,31 +14739,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22045,32 +14767,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22095,32 +14798,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22146,31 +14830,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22194,32 +14859,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22246,36 +14892,13 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22296,36 +14919,13 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22348,37 +14948,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22403,37 +14980,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22459,36 +15013,13 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22512,37 +15043,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -22964,37 +15472,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23016,37 +15499,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23070,38 +15528,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23127,38 +15560,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23185,37 +15593,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23240,38 +15623,13 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23299,42 +15657,13 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23356,42 +15685,13 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23415,43 +15715,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23477,43 +15748,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23540,42 +15782,13 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23600,43 +15813,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23834,47 +16018,8 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23896,47 +16041,8 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -23960,48 +16066,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24026,48 +16093,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24093,47 +16121,8 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24158,48 +16147,9 @@ define zeroext i32 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24226,52 +16176,9 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24293,52 +16200,9 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24362,53 +16226,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24433,53 +16254,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24505,52 +16283,9 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24575,53 +16310,10 @@ define zeroext i64 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24647,25 +16339,13 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24686,25 +16366,13 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24727,28 +16395,14 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24772,28 +16426,14 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24818,30 +16458,14 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24862,30 +16486,14 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24908,33 +16516,15 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -24958,33 +16548,15 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25010,33 +16582,19 @@ define zeroext i64 @test_vpcmpultb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25058,33 +16616,19 @@ define zeroext i64 @test_vpcmpultb_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25108,44 +16652,22 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25170,44 +16692,22 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm2, %xmm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT:    vpxor %ymm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm3, %ymm3
-; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm3, %ymm0
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vpand %xmm2, %xmm3, %xmm2
-; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25359,52 +16859,13 @@ define zeroext i32 @test_vpcmpultw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25425,52 +16886,13 @@ define zeroext i32 @test_vpcmpultw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25493,13 +16915,6 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -25507,39 +16922,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25563,13 +16946,6 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
@@ -25577,39 +16953,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25634,57 +16978,14 @@ define zeroext i64 @test_vpcmpultw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25705,57 +17006,14 @@ define zeroext i64 @test_vpcmpultw_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
 ; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25778,13 +17036,6 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -25792,44 +17043,8 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25853,13 +17068,6 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
@@ -25867,44 +17075,8 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25930,25 +17102,13 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -25970,25 +17130,13 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26012,28 +17160,14 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26058,28 +17192,14 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26105,30 +17225,14 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26150,30 +17254,14 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26197,33 +17285,15 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26248,33 +17318,15 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26300,30 +17352,43 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -26331,109 +17396,109 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -26441,7 +17506,10 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -26449,56 +17517,26 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm0
+; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26520,67 +17558,60 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ;
 ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -26588,7 +17619,7 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -26598,42 +17629,35 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpxor 32(%rdi), %ymm2, %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm2, %ymm1
+; NoVLX-NEXT:    vpxor (%rdi), %ymm2, %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm0
+; NoVLX-NEXT:    vpxor 32(%rdi), %ymm2, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26657,51 +17681,23 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
+; NoVLX-NEXT:    vextracti128 $1, %ymm1, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -26709,109 +17705,109 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -26819,16 +17815,7 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm0
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -26836,41 +17823,52 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm5 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm5, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor %ymm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
+; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpxor %ymm4, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor %ymm5, %ymm1, %ymm1
-; NoVLX-NEXT:    vpxor %ymm5, %ymm3, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm6, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -26895,28 +17893,20 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $96, %rsp
-; NoVLX-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
@@ -26924,39 +17914,39 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm3, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -26964,7 +17954,7 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm0, %rcx
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -26974,56 +17964,38 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm4, %xmm4
-; NoVLX-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    shrl $16, %edi
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm1
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm2, %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm3, %ymm0
-; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpand %xmm4, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpxor 32(%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm1
+; NoVLX-NEXT:    vpxor 32(%rsi), %ymm2, %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm5, %xmm1, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %eax
-; NoVLX-NEXT:    shlq $32, %rax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpxor (%rsi), %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT:    kmovw %edi, %k2
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    shll $16, %eax
+; NoVLX-NEXT:    orl %ecx, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27430,37 +18402,12 @@ define zeroext i32 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27481,37 +18428,12 @@ define zeroext i32 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27534,38 +18456,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27590,38 +18487,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27647,37 +18519,12 @@ define zeroext i32 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27701,38 +18548,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27759,42 +18581,13 @@ define zeroext i64 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27815,42 +18608,13 @@ define zeroext i64 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27873,43 +18637,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27934,43 +18669,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -27996,42 +18702,13 @@ define zeroext i64 @test_vpcmpultd_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28055,43 +18732,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28313,49 +18961,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28377,49 +18988,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28443,50 +19017,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28511,50 +19048,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28580,49 +19080,12 @@ define zeroext i32 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28647,50 +19110,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28717,54 +19143,13 @@ define zeroext i64 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28786,54 +19171,13 @@ define zeroext i64 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28857,55 +19201,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -28930,55 +19233,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29004,54 +19266,13 @@ define zeroext i64 @test_vpcmpultd_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29076,55 +19297,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29151,22 +19331,8 @@ define zeroext i32 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29188,22 +19354,8 @@ define zeroext i32 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29227,23 +19379,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29268,23 +19406,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29310,22 +19434,8 @@ define zeroext i32 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29350,23 +19460,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29393,27 +19489,9 @@ define zeroext i64 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29435,27 +19513,9 @@ define zeroext i64 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29479,28 +19539,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29525,28 +19567,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29572,27 +19596,9 @@ define zeroext i64 @test_vpcmpultd_v16i1
 ;
 ; NoVLX-LABEL: test_vpcmpultd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vpcmpltud (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -29617,28 +19623,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vpcmpltud (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30237,31 +20225,12 @@ define zeroext i32 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30282,31 +20251,12 @@ define zeroext i32 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30329,32 +20279,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30379,32 +20310,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30430,31 +20342,12 @@ define zeroext i32 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30478,32 +20371,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30530,36 +20404,13 @@ define zeroext i64 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30580,36 +20431,13 @@ define zeroext i64 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30632,37 +20460,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30687,37 +20492,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30743,36 +20525,13 @@ define zeroext i64 @test_vpcmpultq_v2i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -30796,37 +20555,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31248,37 +20984,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31300,37 +21011,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31354,38 +21040,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31411,38 +21072,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31469,37 +21105,12 @@ define zeroext i32 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31524,38 +21135,13 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31583,42 +21169,13 @@ define zeroext i64 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31640,42 +21197,13 @@ define zeroext i64 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31699,43 +21227,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31761,43 +21260,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31824,42 +21294,13 @@ define zeroext i64 @test_vpcmpultq_v4i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -31884,43 +21325,14 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32118,47 +21530,8 @@ define zeroext i32 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32180,47 +21553,8 @@ define zeroext i32 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32244,48 +21578,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32310,48 +21605,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32377,47 +21633,8 @@ define zeroext i32 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32442,48 +21659,9 @@ define zeroext i32 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32510,52 +21688,9 @@ define zeroext i64 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32577,52 +21712,9 @@ define zeroext i64 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32646,53 +21738,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32717,53 +21766,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32789,52 +21795,9 @@ define zeroext i64 @test_vpcmpultq_v8i1_
 ;
 ; NoVLX-LABEL: test_vpcmpultq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpltuq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -32859,53 +21822,10 @@ define zeroext i64 @test_masked_vpcmpult
 ;
 ; NoVLX-LABEL: test_masked_vpcmpultq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpltuq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33308,37 +22228,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33359,37 +22254,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33411,37 +22281,12 @@ define zeroext i32 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33465,38 +22310,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33520,38 +22340,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovaps (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33576,38 +22371,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33634,42 +22404,13 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33690,42 +22431,13 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33747,42 +22459,13 @@ define zeroext i64 @test_vcmpoeqps_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33806,43 +22489,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33866,43 +22520,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovaps (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -33927,43 +22552,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vbroadcastss (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34185,49 +22781,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34249,49 +22808,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34314,49 +22836,12 @@ define zeroext i32 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34381,50 +22866,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34449,50 +22897,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34518,50 +22929,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34589,54 +22963,13 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34658,54 +22991,13 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34728,54 +23020,13 @@ define zeroext i64 @test_vcmpoeqps_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34800,55 +23051,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34873,55 +23083,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovaps (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -34947,55 +23116,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastss (%rsi), %ymm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $8, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35023,22 +23151,8 @@ define zeroext i32 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35060,22 +23174,8 @@ define zeroext i32 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35098,22 +23198,8 @@ define zeroext i32 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35138,23 +23224,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35179,23 +23251,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35221,23 +23279,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35306,27 +23350,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35348,27 +23374,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps (%rdi), %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35391,27 +23399,9 @@ define zeroext i64 @test_vcmpoeqps_v16i1
 ;
 ; NoVLX-LABEL: test_vcmpoeqps_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpeqps (%rdi){1to16}, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35436,28 +23426,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps %zmm1, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35482,28 +23454,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi), %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -35529,28 +23483,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqps_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    kmovw %edi, %k1
-; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k1 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %edi, %k1
+; NoVLX-NEXT:    vcmpeqps (%rsi){1to16}, %zmm0, %k0 {%k1}
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36194,31 +24130,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36239,31 +24156,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovapd (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36285,31 +24183,12 @@ define zeroext i32 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36333,32 +24212,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36382,32 +24242,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovapd (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36432,32 +24273,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36484,36 +24306,13 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36534,36 +24333,13 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovapd (%rdi), %xmm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36585,36 +24361,13 @@ define zeroext i64 @test_vcmpoeqpd_v2i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36638,37 +24391,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm1 killed %xmm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36692,37 +24422,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovapd (%rsi), %xmm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -36747,37 +24454,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %xmm0 killed %xmm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovddup {{.*#+}} xmm1 = mem[0,0]
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kmovw %k0, %ecx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $14, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37193,37 +24877,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37245,37 +24904,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovapd (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37298,37 +24932,12 @@ define zeroext i32 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37353,38 +24962,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37409,38 +24993,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovapd (%rsi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37466,38 +25025,13 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37525,42 +25059,13 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37582,42 +25087,13 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vmovapd (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37640,42 +25116,13 @@ define zeroext i64 @test_vcmpoeqpd_v4i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    vbroadcastsd (%rdi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37700,43 +25147,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm1 killed %ymm1 def %zmm1
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37761,43 +25179,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vmovapd (%rsi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -37823,43 +25212,14 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: def %ymm0 killed %ymm0 def %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vbroadcastsd (%rsi), %ymm1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
-; NoVLX-NEXT:    kmovw %k0, %esi
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k0
+; NoVLX-NEXT:    kshiftrw $12, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38113,47 +25473,8 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38175,47 +25496,8 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38238,47 +25520,8 @@ define zeroext i32 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38303,48 +25546,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38369,48 +25573,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38436,48 +25601,9 @@ define zeroext i32 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38555,52 +25681,9 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38622,52 +25705,9 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38690,52 +25730,9 @@ define zeroext i64 @test_vcmpoeqpd_v8i1_
 ;
 ; NoVLX-LABEL: test_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vcmpeqpd (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38760,53 +25757,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38831,53 +25785,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -38903,53 +25814,10 @@ define zeroext i64 @test_masked_vcmpoeqp
 ;
 ; NoVLX-LABEL: test_masked_vcmpoeqpd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # %bb.0: # %entry
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vcmpeqpd (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kshiftrw $7, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
-; NoVLX-NEXT:    kshiftrw $6, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
-; NoVLX-NEXT:    kshiftrw $5, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
-; NoVLX-NEXT:    kshiftrw $4, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
-; NoVLX-NEXT:    kshiftrw $3, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
-; NoVLX-NEXT:    kshiftrw $2, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftrw $1, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    kmovw %k0, %edx
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
-; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; NoVLX-NEXT:    shlq $32, %rcx
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    orq %rcx, %rax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    movzwl %ax, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
 entry:
@@ -39028,22 +25896,8 @@ define i32 @test_cmpm_rnd_zero(<16 x flo
 ;
 ; NoVLX-LABEL: test_cmpm_rnd_zero:
 ; NoVLX:       # %bb.0:
-; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:    .cfi_offset %rbp, -16
-; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    andq $-32, %rsp
-; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k1
-; NoVLX-NEXT:    kxorw %k0, %k0, %k0
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kmovw %k0, (%rsp)
-; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    movq %rbp, %rsp
-; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vzeroupper
 ; NoVLX-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
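
An illustrative note on the updated CHECK lines above: the compare result now stays in a mask register, the upper bits are cleared with a kshiftlw/kshiftrw pair where needed, and the value is read straight into %eax with kmovw, with no stack round trip. The IR bodies of these tests are elided from the diff context; the function below is only a simplified, hypothetical stand-in for the pattern they exercise (the name, signature, and widening shuffle are assumptions, not copied from the test file).

; Illustrative sketch, not part of the commit: compare four doubles, widen the
; <4 x i1> result into a <32 x i1> vector whose upper lanes are false, and
; return it as an i32 bitmask.
define zeroext i32 @illustrative_vcmpoeqpd_v4i1_v32i1(<4 x double> %a, <4 x double> %b) {
entry:
  %cmp = fcmp oeq <4 x double> %a, %b
  %wide = shufflevector <4 x i1> %cmp, <4 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %mask = bitcast <32 x i1> %wide to i32
  ret i32 %mask
}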

Modified: llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-256.ll Tue Jan 23 06:25:39 2018
@@ -411,28 +411,22 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8
 ;
 ; AVX512F-LABEL: v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm1
-; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
-; AVX512F-NEXT:    movl (%rsp), %eax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %ecx, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-and-setcc-512.ll Tue Jan 23 06:25:39 2018
@@ -280,38 +280,22 @@ define i32 @v32i16(<32 x i16> %a, <32 x
 ;
 ; AVX512F-LABEL: v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
+; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512F-NEXT:    vpcmpgtw %ymm7, %ymm5, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vpcmpgtw %ymm6, %ymm4, %ymm1
 ; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtw %ymm6, %ymm4, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpcmpgtw %ymm7, %ymm5, %ymm2
-; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
-; AVX512F-NEXT:    movl (%rsp), %eax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k2}
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %ecx, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1332,43 +1316,40 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8
 ;
 ; AVX512F-LABEL: v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $64, %rsp
 ; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
+; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k2
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm2
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm2
-; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT:    vpslld $31, %zmm2, %zmm2
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k3
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
+; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm5, %ymm0
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl (%rsp), %ecx
-; AVX512F-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm4, %ymm2
+; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0 {%k4}
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k0 {%k3}
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    orl %eax, %ecx
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k2}
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0 {%k1}
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %edx, %eax
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %rcx, %rax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll Tue Jan 23 06:25:39 2018
@@ -505,24 +505,15 @@ define <32 x i8> @ext_i32_32i8(i32 %a0)
 ;
 ; AVX512F-LABEL: ext_i32_32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
-; AVX512F-NEXT:    movl %edi, (%rsp)
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    shrl $16, %edi
+; AVX512F-NEXT:    kmovw %edi, %k2
 ; AVX512F-NEXT:    movl {{.*}}(%rip), %eax
-; AVX512F-NEXT:    kmovw (%rsp), %k1
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: ext_i32_32i8:
@@ -792,27 +783,14 @@ define <32 x i16> @ext_i32_32i16(i32 %a0
 ;
 ; AVX512F-LABEL: ext_i32_32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
-; AVX512F-NEXT:    movl %edi, (%rsp)
-; AVX512F-NEXT:    kmovw (%rsp), %k1
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512F-NEXT:    vpand %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    shrl $16, %edi
+; AVX512F-NEXT:    kmovw %edi, %k2
+; AVX512F-NEXT:    movl {{.*}}(%rip), %eax
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: ext_i32_32i16:
@@ -950,33 +928,27 @@ define <64 x i8> @ext_i64_64i8(i64 %a0)
 ;
 ; AVX512F-LABEL: ext_i64_64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $64, %rsp
-; AVX512F-NEXT:    movl %edi, (%rsp)
-; AVX512F-NEXT:    shrq $32, %rdi
-; AVX512F-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq %rdi, %rcx
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    movl %edi, %edx
+; AVX512F-NEXT:    shrl $16, %edx
+; AVX512F-NEXT:    shrq $32, %rax
+; AVX512F-NEXT:    shrq $48, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k2
+; AVX512F-NEXT:    kmovw %eax, %k3
+; AVX512F-NEXT:    kmovw %edx, %k4
 ; AVX512F-NEXT:    movl {{.*}}(%rip), %eax
-; AVX512F-NEXT:    kmovw (%rsp), %k1
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k2} {z}
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k4} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; AVX512F-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpbroadcastd %eax, %zmm1 {%k3} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vpbroadcastd %eax, %zmm2 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: ext_i64_64i8:
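
Similarly for the ext_* tests above, whose IR is elided from the diff: as an illustrative sketch rather than the actual test body (the name and exact form are assumptions), the pattern turns an integer bitmask into a vector of 0/1 elements; with the updated CHECK lines the four 16-bit chunks of the i64 mask reach the k-registers through kmovw plus shrl/shrq, with no stack spill.

; Illustrative sketch, not taken from the test file: expand an i64 bitmask into
; a <64 x i8> vector holding 0 or 1 per bit.
define <64 x i8> @illustrative_ext_i64_64i8(i64 %mask) {
  %bits = bitcast i64 %mask to <64 x i1>
  %vec = zext <64 x i1> %bits to <64 x i8>
  ret <64 x i8> %vec
}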

Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-256.ll Tue Jan 23 06:25:39 2018
@@ -183,24 +183,16 @@ define i32 @v32i8(<32 x i8> %a, <32 x i8
 ;
 ; AVX512F-LABEL: v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
-; AVX512F-NEXT:    movl (%rsp), %eax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %ecx, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/bitcast-setcc-512.ll Tue Jan 23 06:25:39 2018
@@ -51,24 +51,16 @@ define i32 @v32i16(<32 x i16> %a, <32 x
 ;
 ; AVX512F-LABEL: v32i16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $32, %rsp
-; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
-; AVX512F-NEXT:    movl (%rsp), %eax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm0
+; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %ecx, %eax
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
@@ -1006,35 +998,28 @@ define i64 @v64i8(<64 x i8> %a, <64 x i8
 ;
 ; AVX512F-LABEL: v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-NEXT:    .cfi_offset %rbp, -16
-; AVX512F-NEXT:    movq %rsp, %rbp
-; AVX512F-NEXT:    .cfi_def_cfa_register %rbp
-; AVX512F-NEXT:    andq $-32, %rsp
-; AVX512F-NEXT:    subq $64, %rsp
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm2
 ; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, (%rsp)
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    shll $16, %ecx
+; AVX512F-NEXT:    orl %eax, %ecx
 ; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl (%rsp), %ecx
-; AVX512F-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    shll $16, %eax
+; AVX512F-NEXT:    orl %edx, %eax
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %rcx, %rax
-; AVX512F-NEXT:    movq %rbp, %rsp
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll Tue Jan 23 06:25:39 2018
@@ -132,22 +132,70 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7
 ; AVX256VL:       # %bb.0:
 ; AVX256VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX256VL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX256VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
-; AVX256VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX256VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
-; AVX256VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
-; AVX256VL-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX256VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX256VL-NEXT:    vpmovsxbw %xmm1, %ymm1
+; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k1
+; AVX256VL-NEXT:    vpmovsxbw %xmm0, %ymm0
+; AVX256VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k2
+; AVX256VL-NEXT:    vpmovsxwd %xmm0, %ymm0
+; AVX256VL-NEXT:    vptestmd %ymm0, %ymm0, %k3
+; AVX256VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k3} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm2 {%k2} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
+; AVX256VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX256VL-NEXT:    vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX256VL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
+; AVX256VL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[6,7,12,13,u,u,8,9,6,7,14,15,14,15,0,1,22,23,28,29,18,19,26,27,22,23,u,u,30,31,16,17]
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm2 {%k1} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm2, %xmm2
+; AVX256VL-NEXT:    kshiftrw $8, %k1, %k1
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm3 {%k1} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm3, %xmm3
+; AVX256VL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX256VL-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[1,1,2,1]
+; AVX256VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
+; AVX256VL-NEXT:    vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
+; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm2
+; AVX256VL-NEXT:    vpslld $31, %ymm2, %ymm2
+; AVX256VL-NEXT:    vptestmd %ymm2, %ymm2, %k1
+; AVX256VL-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX256VL-NEXT:    vpmovsxwd %xmm1, %ymm1
+; AVX256VL-NEXT:    vpslld $31, %ymm1, %ymm1
+; AVX256VL-NEXT:    vptestmd %ymm1, %ymm1, %k0
+; AVX256VL-NEXT:    kunpckbw %k1, %k0, %k0
+; AVX256VL-NEXT:    kshiftrw $8, %k0, %k2
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k2} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm1, %xmm1
+; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm1, %xmm1
+; AVX256VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
+; AVX256VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX256VL-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX256VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX256VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX256VL-NEXT:    retq
 ;
 ; AVX512NOBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
 ; AVX512NOBW:       # %bb.0:
 ; AVX512NOBW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512NOBW-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX512NOBW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm0[3,6,u,12,3,7,7,0,3,6,1,13,3,u,7,0,u,u,22,u,u,u,u,u,u,u,u,u,u,21,u,u]
-; AVX512NOBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512NOBW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,6,u,u,u,u,u,u,u,u,u,u,5,u,u,19,22,u,28,19,23,23,16,19,22,17,29,19,u,23,16]
-; AVX512NOBW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,255,255,255,255,255,255,255,255,255,0,255,255,0,0,255,0,0,0,0,0,0,0,0,0,0,255,0,0]
-; AVX512NOBW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512NOBW-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512NOBW-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512NOBW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512NOBW-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512NOBW-NEXT:    vptestmd %zmm0, %zmm0, %k2
+; AVX512NOBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512NOBW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512NOBW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
+; AVX512NOBW-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512NOBW-NEXT:    vptestmd %zmm2, %zmm2, %k1
+; AVX512NOBW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512NOBW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512NOBW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; AVX512NOBW-NEXT:    retq
 ;
 ; AVX256VLBW-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:

Modified: llvm/trunk/test/CodeGen/X86/vector-compare-results.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-compare-results.ll?rev=323201&r1=323200&r2=323201&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-compare-results.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-compare-results.ll Tue Jan 23 06:25:39 2018
@@ -2257,23 +2257,25 @@ define <64 x i1> @test_cmp_v64i8(<64 x i
 ;
 ; AVX512F-LABEL: test_cmp_v64i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm3
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm2
+; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512F-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512F-NEXT:    vmovdqa %xmm4, %xmm2
+; AVX512F-NEXT:    vmovdqa %xmm4, %xmm1
+; AVX512F-NEXT:    # kill: def %xmm2 killed %xmm2 killed %ymm2
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v64i8:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm4
 ; AVX512DQ-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm3
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm4
+; AVX512DQ-NEXT:    vpcmpgtb %ymm3, %ymm1, %ymm2
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    # kill: def %xmm0 killed %xmm0 killed %ymm0
-; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm2
+; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm1
+; AVX512DQ-NEXT:    # kill: def %xmm2 killed %xmm2 killed %ymm2
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
@@ -2711,32 +2713,24 @@ define <32 x i1> @test_cmp_v32f32(<32 x
 ;
 ; AVX512F-LABEL: test_cmp_v32f32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcmpltps %zmm0, %zmm2, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vcmpltps %zmm1, %zmm3, %k1
+; AVX512F-NEXT:    vcmpltps %zmm0, %zmm2, %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v32f32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm3, %k0
+; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v32f32:
@@ -3262,32 +3256,24 @@ define <32 x i1> @test_cmp_v32i32(<32 x
 ;
 ; AVX512F-LABEL: test_cmp_v32i32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm0, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtd %zmm3, %zmm1, %k1
+; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm0, %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v32i32:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpcmpgtd %zmm3, %zmm1, %k0
+; AVX512DQ-NEXT:    vpcmpgtd %zmm2, %zmm0, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v32i32:
@@ -6386,36 +6372,36 @@ define <128 x i1> @test_cmp_v128i8(<128
 ; AVX512F-LABEL: test_cmp_v128i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm3
-; AVX512F-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512F-NEXT:    vpmovsxbd %xmm4, %zmm4
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm4
 ; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k0
-; AVX512F-NEXT:    kmovw %k0, 14(%rdi)
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT:    kmovw %k0, 12(%rdi)
-; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512F-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512F-NEXT:    kmovw %k0, 10(%rdi)
-; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT:    kmovw %k0, 8(%rdi)
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512F-NEXT:    kmovw %k0, 6(%rdi)
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, 4(%rdi)
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512F-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512F-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512F-NEXT:    kmovw %k7, 14(%rdi)
+; AVX512F-NEXT:    kmovw %k6, 12(%rdi)
+; AVX512F-NEXT:    kmovw %k5, 10(%rdi)
+; AVX512F-NEXT:    kmovw %k4, 8(%rdi)
+; AVX512F-NEXT:    kmovw %k3, 6(%rdi)
+; AVX512F-NEXT:    kmovw %k2, 4(%rdi)
+; AVX512F-NEXT:    kmovw %k1, 2(%rdi)
 ; AVX512F-NEXT:    kmovw %k0, (%rdi)
 ; AVX512F-NEXT:    movq %rdi, %rax
 ; AVX512F-NEXT:    vzeroupper
@@ -6424,36 +6410,36 @@ define <128 x i1> @test_cmp_v128i8(<128
 ; AVX512DQ-LABEL: test_cmp_v128i8:
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm2
-; AVX512DQ-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm3
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT:    vpmovsxbd %xmm4, %zmm4
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm4
 ; AVX512DQ-NEXT:    vptestmd %zmm4, %zmm4, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 14(%rdi)
-; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 12(%rdi)
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm3
-; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 10(%rdi)
-; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 8(%rdi)
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm2
-; AVX512DQ-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 6(%rdi)
-; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 4(%rdi)
-; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
-; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512DQ-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT:    vpcmpgtb %ymm5, %ymm1, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k2
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k3
+; AVX512DQ-NEXT:    vpcmpgtb %ymm6, %ymm2, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k4
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k5
+; AVX512DQ-NEXT:    vpcmpgtb %ymm7, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k6
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k7
+; AVX512DQ-NEXT:    kmovw %k7, 14(%rdi)
+; AVX512DQ-NEXT:    kmovw %k6, 12(%rdi)
+; AVX512DQ-NEXT:    kmovw %k5, 10(%rdi)
+; AVX512DQ-NEXT:    kmovw %k4, 8(%rdi)
+; AVX512DQ-NEXT:    kmovw %k3, 6(%rdi)
+; AVX512DQ-NEXT:    kmovw %k2, 4(%rdi)
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
 ; AVX512DQ-NEXT:    movq %rdi, %rax
 ; AVX512DQ-NEXT:    vzeroupper
@@ -6910,40 +6896,32 @@ define <32 x i1> @test_cmp_v32f64(<32 x
 ;
 ; AVX512F-LABEL: test_cmp_v32f64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm4, %k0
-; AVX512F-NEXT:    vcmpltpd %zmm1, %zmm5, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vcmpltpd %zmm2, %zmm6, %k0
 ; AVX512F-NEXT:    vcmpltpd %zmm3, %zmm7, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm4, %k0
+; AVX512F-NEXT:    vcmpltpd %zmm1, %zmm5, %k2
+; AVX512F-NEXT:    kunpckbw %k0, %k2, %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v32f64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm4, %k0
-; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm5, %k1
-; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vcmpltpd %zmm2, %zmm6, %k0
 ; AVX512DQ-NEXT:    vcmpltpd %zmm3, %zmm7, %k1
 ; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm4, %k1
+; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm5, %k2
+; AVX512DQ-NEXT:    kunpckbw %k1, %k2, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v32f64:
@@ -7561,40 +7539,32 @@ define <32 x i1> @test_cmp_v32i64(<32 x
 ;
 ; AVX512F-LABEL: test_cmp_v32i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpcmpgtq %zmm4, %zmm0, %k0
-; AVX512F-NEXT:    vpcmpgtq %zmm5, %zmm1, %k1
-; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
-; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpcmpgtq %zmm6, %zmm2, %k0
 ; AVX512F-NEXT:    vpcmpgtq %zmm7, %zmm3, %k1
 ; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
+; AVX512F-NEXT:    vpcmpgtq %zmm4, %zmm0, %k0
+; AVX512F-NEXT:    vpcmpgtq %zmm5, %zmm1, %k2
+; AVX512F-NEXT:    kunpckbw %k0, %k2, %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512F-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v32i64:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vpcmpgtq %zmm4, %zmm0, %k0
-; AVX512DQ-NEXT:    vpcmpgtq %zmm5, %zmm1, %k1
-; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0
-; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpcmpgtq %zmm6, %zmm2, %k0
 ; AVX512DQ-NEXT:    vpcmpgtq %zmm7, %zmm3, %k1
 ; AVX512DQ-NEXT:    kunpckbw %k0, %k1, %k0
+; AVX512DQ-NEXT:    vpcmpgtq %zmm4, %zmm0, %k1
+; AVX512DQ-NEXT:    vpcmpgtq %zmm5, %zmm1, %k2
+; AVX512DQ-NEXT:    kunpckbw %k1, %k2, %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v32i64:
