[llvm] r311879 - [X86][Haswell] Updating HSW instruction scheduling information

Gadi Haber via llvm-commits <llvm-commits at lists.llvm.org>
Mon Aug 28 03:04:17 PDT 2017

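The churn in the checked assembly below is a by-product of the scheduling-model update rather than a change to the tests themselves: with the new Haswell numbers the compiler interleaves the kmovw mask extracts with the vpinsrb inserts, so each mask bit can be staged through just %eax and %ecx. In several of the functions below that eliminates the %r8-%r15/%rbx temporaries entirely, so their pushes/pops and .cfi_offset directives disappear and the remaining .Lcfi labels renumber.

For context, the Haswell scheduling information itself lives in TableGen (X86SchedHaswell.td). The sketch below only shows the general shape of such an entry; the port assignment and latency are illustrative placeholders, not values taken from r311879:

    // Illustrative sketch only: ProcResource/WriteRes follow the usual
    // LLVM scheduling-model conventions, but the port and latency here
    // are example values, not numbers from this patch.
    def HWPort5 : ProcResource<1>;           // one shuffle-capable port
    def : WriteRes<WriteShuffle, [HWPort5]> {
      let Latency     = 1;                   // single-cycle result
      let NumMicroOps = 1;                   // one uop issued to port 5
    }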

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=311879&r1=311878&r2=311879&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Mon Aug 28 03:04:16 2017
@@ -19,23 +19,8 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    movq %rsp, %rbp
 ; NoVLX-NEXT:  .Lcfi2:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi3:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi4:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi5:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi6:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi7:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -44,64 +29,64 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -110,12 +95,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -137,30 +117,15 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi8:
+; NoVLX-NEXT:  .Lcfi3:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi9:
+; NoVLX-NEXT:  .Lcfi4:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi10:
+; NoVLX-NEXT:  .Lcfi5:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi11:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi12:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi13:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi14:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi15:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -169,64 +134,64 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -235,12 +200,7 @@ define zeroext i32 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -264,30 +224,15 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi16:
+; NoVLX-NEXT:  .Lcfi6:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi17:
+; NoVLX-NEXT:  .Lcfi7:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi18:
+; NoVLX-NEXT:  .Lcfi8:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi19:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi20:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi21:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi22:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi23:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -297,64 +242,64 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -363,12 +308,7 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -393,30 +333,15 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi24:
+; NoVLX-NEXT:  .Lcfi9:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi25:
+; NoVLX-NEXT:  .Lcfi10:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi26:
+; NoVLX-NEXT:  .Lcfi11:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi27:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi28:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi29:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi30:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi31:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -426,64 +351,64 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -492,12 +417,7 @@ define zeroext i32 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -523,12 +443,12 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi32:
+; NoVLX-NEXT:  .Lcfi12:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi33:
+; NoVLX-NEXT:  .Lcfi13:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi34:
+; NoVLX-NEXT:  .Lcfi14:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -537,24 +457,20 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi35:
+; NoVLX-NEXT:  .Lcfi15:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi36:
+; NoVLX-NEXT:  .Lcfi16:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi37:
+; NoVLX-NEXT:  .Lcfi17:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi38:
+; NoVLX-NEXT:  .Lcfi18:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi39:
+; NoVLX-NEXT:  .Lcfi19:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -597,11 +513,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -613,11 +529,15 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -653,12 +573,12 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi40:
+; NoVLX-NEXT:  .Lcfi20:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi41:
+; NoVLX-NEXT:  .Lcfi21:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi42:
+; NoVLX-NEXT:  .Lcfi22:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -667,24 +587,20 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi43:
+; NoVLX-NEXT:  .Lcfi23:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi44:
+; NoVLX-NEXT:  .Lcfi24:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi45:
+; NoVLX-NEXT:  .Lcfi25:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi46:
+; NoVLX-NEXT:  .Lcfi26:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi47:
+; NoVLX-NEXT:  .Lcfi27:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -727,11 +643,11 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -743,11 +659,15 @@ define zeroext i64 @test_vpcmpeqb_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -785,12 +705,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi48:
+; NoVLX-NEXT:  .Lcfi28:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi49:
+; NoVLX-NEXT:  .Lcfi29:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi50:
+; NoVLX-NEXT:  .Lcfi30:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -799,25 +719,21 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi51:
+; NoVLX-NEXT:  .Lcfi31:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi52:
+; NoVLX-NEXT:  .Lcfi32:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi53:
+; NoVLX-NEXT:  .Lcfi33:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi54:
+; NoVLX-NEXT:  .Lcfi34:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi55:
+; NoVLX-NEXT:  .Lcfi35:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -860,11 +776,11 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -876,11 +792,15 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -919,12 +839,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi56:
+; NoVLX-NEXT:  .Lcfi36:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi57:
+; NoVLX-NEXT:  .Lcfi37:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi58:
+; NoVLX-NEXT:  .Lcfi38:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -933,25 +853,21 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi59:
+; NoVLX-NEXT:  .Lcfi39:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi60:
+; NoVLX-NEXT:  .Lcfi40:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi61:
+; NoVLX-NEXT:  .Lcfi41:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi62:
+; NoVLX-NEXT:  .Lcfi42:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi63:
+; NoVLX-NEXT:  .Lcfi43:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -994,11 +910,11 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -1010,11 +926,15 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1055,12 +975,12 @@ define zeroext i64 @test_vpcmpeqb_v32i1_
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi64:
+; NoVLX-NEXT:  .Lcfi44:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi65:
+; NoVLX-NEXT:  .Lcfi45:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi66:
+; NoVLX-NEXT:  .Lcfi46:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1104,12 +1024,12 @@ define zeroext i64 @test_vpcmpeqb_v32i1_
 ; NoVLX-LABEL: test_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi67:
+; NoVLX-NEXT:  .Lcfi47:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi68:
+; NoVLX-NEXT:  .Lcfi48:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi69:
+; NoVLX-NEXT:  .Lcfi49:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1155,12 +1075,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi70:
+; NoVLX-NEXT:  .Lcfi50:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi71:
+; NoVLX-NEXT:  .Lcfi51:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi72:
+; NoVLX-NEXT:  .Lcfi52:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -1216,12 +1136,12 @@ define zeroext i64 @test_masked_vpcmpeqb
 ; NoVLX-LABEL: test_masked_vpcmpeqb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi73:
+; NoVLX-NEXT:  .Lcfi53:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi74:
+; NoVLX-NEXT:  .Lcfi54:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi75:
+; NoVLX-NEXT:  .Lcfi55:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -1400,12 +1320,12 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi76:
+; NoVLX-NEXT:  .Lcfi56:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi77:
+; NoVLX-NEXT:  .Lcfi57:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi78:
+; NoVLX-NEXT:  .Lcfi58:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1475,12 +1395,12 @@ define zeroext i32 @test_vpcmpeqw_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi79:
+; NoVLX-NEXT:  .Lcfi59:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi80:
+; NoVLX-NEXT:  .Lcfi60:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi81:
+; NoVLX-NEXT:  .Lcfi61:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1552,12 +1472,12 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi82:
+; NoVLX-NEXT:  .Lcfi62:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi83:
+; NoVLX-NEXT:  .Lcfi63:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi84:
+; NoVLX-NEXT:  .Lcfi64:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1631,12 +1551,12 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi85:
+; NoVLX-NEXT:  .Lcfi65:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi86:
+; NoVLX-NEXT:  .Lcfi66:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi87:
+; NoVLX-NEXT:  .Lcfi67:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -1711,12 +1631,12 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi88:
+; NoVLX-NEXT:  .Lcfi68:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi89:
+; NoVLX-NEXT:  .Lcfi69:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi90:
+; NoVLX-NEXT:  .Lcfi70:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1724,43 +1644,43 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1791,12 +1711,12 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi91:
+; NoVLX-NEXT:  .Lcfi71:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi92:
+; NoVLX-NEXT:  .Lcfi72:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi93:
+; NoVLX-NEXT:  .Lcfi73:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1804,43 +1724,43 @@ define zeroext i64 @test_vpcmpeqw_v8i1_v
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1873,12 +1793,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi94:
+; NoVLX-NEXT:  .Lcfi74:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi95:
+; NoVLX-NEXT:  .Lcfi75:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi96:
+; NoVLX-NEXT:  .Lcfi76:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1887,43 +1807,43 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1957,12 +1877,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi97:
+; NoVLX-NEXT:  .Lcfi77:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi98:
+; NoVLX-NEXT:  .Lcfi78:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi99:
+; NoVLX-NEXT:  .Lcfi79:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -1971,43 +1891,43 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2043,30 +1963,15 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi100:
+; NoVLX-NEXT:  .Lcfi80:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi101:
+; NoVLX-NEXT:  .Lcfi81:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi102:
+; NoVLX-NEXT:  .Lcfi82:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi103:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi104:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi105:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi106:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi107:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2075,64 +1980,64 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2141,12 +2046,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2169,30 +2069,15 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi108:
+; NoVLX-NEXT:  .Lcfi83:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi109:
+; NoVLX-NEXT:  .Lcfi84:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi110:
+; NoVLX-NEXT:  .Lcfi85:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi111:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi112:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi113:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi114:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi115:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2201,64 +2086,64 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2267,12 +2152,7 @@ define zeroext i32 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2297,30 +2177,15 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi116:
+; NoVLX-NEXT:  .Lcfi86:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi117:
+; NoVLX-NEXT:  .Lcfi87:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi118:
+; NoVLX-NEXT:  .Lcfi88:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi119:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi120:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi121:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi122:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi123:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2330,64 +2195,64 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2396,12 +2261,7 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2427,30 +2287,15 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi124:
+; NoVLX-NEXT:  .Lcfi89:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi125:
+; NoVLX-NEXT:  .Lcfi90:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi126:
+; NoVLX-NEXT:  .Lcfi91:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi127:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi128:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi129:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi130:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi131:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -2460,64 +2305,64 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -2526,12 +2371,7 @@ define zeroext i32 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -2558,12 +2398,12 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi132:
+; NoVLX-NEXT:  .Lcfi92:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi133:
+; NoVLX-NEXT:  .Lcfi93:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi134:
+; NoVLX-NEXT:  .Lcfi94:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2572,24 +2412,20 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi135:
+; NoVLX-NEXT:  .Lcfi95:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi136:
+; NoVLX-NEXT:  .Lcfi96:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi137:
+; NoVLX-NEXT:  .Lcfi97:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi138:
+; NoVLX-NEXT:  .Lcfi98:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi139:
+; NoVLX-NEXT:  .Lcfi99:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2632,11 +2468,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2648,11 +2484,15 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2689,12 +2529,12 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi140:
+; NoVLX-NEXT:  .Lcfi100:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi141:
+; NoVLX-NEXT:  .Lcfi101:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi142:
+; NoVLX-NEXT:  .Lcfi102:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2703,24 +2543,20 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi143:
+; NoVLX-NEXT:  .Lcfi103:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi144:
+; NoVLX-NEXT:  .Lcfi104:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi145:
+; NoVLX-NEXT:  .Lcfi105:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi146:
+; NoVLX-NEXT:  .Lcfi106:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi147:
+; NoVLX-NEXT:  .Lcfi107:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2763,11 +2599,11 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2779,11 +2615,15 @@ define zeroext i64 @test_vpcmpeqw_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2822,12 +2662,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi148:
+; NoVLX-NEXT:  .Lcfi108:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi149:
+; NoVLX-NEXT:  .Lcfi109:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi150:
+; NoVLX-NEXT:  .Lcfi110:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2836,25 +2676,21 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi151:
+; NoVLX-NEXT:  .Lcfi111:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi152:
+; NoVLX-NEXT:  .Lcfi112:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi153:
+; NoVLX-NEXT:  .Lcfi113:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi154:
+; NoVLX-NEXT:  .Lcfi114:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi155:
+; NoVLX-NEXT:  .Lcfi115:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -2897,11 +2733,11 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -2913,11 +2749,15 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -2957,12 +2797,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi156:
+; NoVLX-NEXT:  .Lcfi116:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi157:
+; NoVLX-NEXT:  .Lcfi117:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi158:
+; NoVLX-NEXT:  .Lcfi118:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -2971,25 +2811,21 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi159:
+; NoVLX-NEXT:  .Lcfi119:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi160:
+; NoVLX-NEXT:  .Lcfi120:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi161:
+; NoVLX-NEXT:  .Lcfi121:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi162:
+; NoVLX-NEXT:  .Lcfi122:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi163:
+; NoVLX-NEXT:  .Lcfi123:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -3032,11 +2868,11 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -3048,11 +2884,15 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -3093,62 +2933,58 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi164:
+; NoVLX-NEXT:  .Lcfi124:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi165:
+; NoVLX-NEXT:  .Lcfi125:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi166:
+; NoVLX-NEXT:  .Lcfi126:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -3156,190 +2992,194 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -3444,69 +3284,68 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-LABEL: test_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi167:
+; NoVLX-NEXT:  .Lcfi127:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi168:
+; NoVLX-NEXT:  .Lcfi128:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi169:
+; NoVLX-NEXT:  .Lcfi129:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -3514,7 +3353,8 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -3524,19 +3364,19 @@ define zeroext i64 @test_vpcmpeqw_v32i1_
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpeqw 32(%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
@@ -3712,12 +3552,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi170:
+; NoVLX-NEXT:  .Lcfi130:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi171:
+; NoVLX-NEXT:  .Lcfi131:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi172:
+; NoVLX-NEXT:  .Lcfi132:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -3728,17 +3568,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3746,9 +3581,10 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm4
@@ -3766,39 +3602,40 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
@@ -3806,69 +3643,72 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm6
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -3877,18 +3717,13 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm3
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpeqw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm4
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -3897,147 +3732,152 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpeqw %ymm2, %ymm4, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpcmpeqw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT:    vpcmpeqw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -4075,12 +3915,12 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-LABEL: test_masked_vpcmpeqw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi173:
+; NoVLX-NEXT:  .Lcfi133:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi174:
+; NoVLX-NEXT:  .Lcfi134:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi175:
+; NoVLX-NEXT:  .Lcfi135:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -4092,8 +3932,6 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -4106,19 +3944,20 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -4126,6 +3965,7 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -4148,174 +3988,174 @@ define zeroext i64 @test_masked_vpcmpeqw
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT:    vpcmpeqw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    vpcmpeqw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -4370,8 +4210,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4425,8 +4265,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4465,7 +4305,6 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4477,13 +4316,14 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4500,8 +4340,8 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4577,8 +4417,8 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4638,8 +4478,8 @@ define zeroext i8 @test_vpcmpeqd_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4680,7 +4520,6 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4692,13 +4531,14 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4715,8 +4555,8 @@ define zeroext i8 @test_masked_vpcmpeqd_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -4775,8 +4615,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4829,8 +4669,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4869,7 +4709,6 @@ define zeroext i16 @test_masked_vpcmpeqd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -4881,13 +4720,14 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -4903,8 +4743,8 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -4979,8 +4819,8 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5039,8 +4879,8 @@ define zeroext i16 @test_vpcmpeqd_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5081,7 +4921,6 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5093,13 +4932,14 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -5115,8 +4955,8 @@ define zeroext i16 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -5159,12 +4999,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi176:
+; NoVLX-NEXT:  .Lcfi136:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi177:
+; NoVLX-NEXT:  .Lcfi137:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi178:
+; NoVLX-NEXT:  .Lcfi138:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5202,12 +5042,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi179:
+; NoVLX-NEXT:  .Lcfi139:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi180:
+; NoVLX-NEXT:  .Lcfi140:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi181:
+; NoVLX-NEXT:  .Lcfi141:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5247,16 +5087,15 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi182:
+; NoVLX-NEXT:  .Lcfi142:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi183:
+; NoVLX-NEXT:  .Lcfi143:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi184:
+; NoVLX-NEXT:  .Lcfi144:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5268,13 +5107,14 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -5312,12 +5152,12 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi185:
+; NoVLX-NEXT:  .Lcfi145:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi186:
+; NoVLX-NEXT:  .Lcfi146:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi187:
+; NoVLX-NEXT:  .Lcfi147:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5378,12 +5218,12 @@ define zeroext i32 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi188:
+; NoVLX-NEXT:  .Lcfi148:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi189:
+; NoVLX-NEXT:  .Lcfi149:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi190:
+; NoVLX-NEXT:  .Lcfi150:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -5425,17 +5265,16 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi191:
+; NoVLX-NEXT:  .Lcfi151:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi192:
+; NoVLX-NEXT:  .Lcfi152:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi193:
+; NoVLX-NEXT:  .Lcfi153:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -5447,13 +5286,14 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -5493,20 +5333,20 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi194:
+; NoVLX-NEXT:  .Lcfi154:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi195:
+; NoVLX-NEXT:  .Lcfi155:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi196:
+; NoVLX-NEXT:  .Lcfi156:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5542,20 +5382,20 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi197:
+; NoVLX-NEXT:  .Lcfi157:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi198:
+; NoVLX-NEXT:  .Lcfi158:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi199:
+; NoVLX-NEXT:  .Lcfi159:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5593,19 +5433,18 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi200:
+; NoVLX-NEXT:  .Lcfi160:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi201:
+; NoVLX-NEXT:  .Lcfi161:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi202:
+; NoVLX-NEXT:  .Lcfi162:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5618,13 +5457,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5664,19 +5504,18 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi203:
+; NoVLX-NEXT:  .Lcfi163:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi204:
+; NoVLX-NEXT:  .Lcfi164:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi205:
+; NoVLX-NEXT:  .Lcfi165:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5689,13 +5528,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5736,12 +5576,12 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi206:
+; NoVLX-NEXT:  .Lcfi166:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi207:
+; NoVLX-NEXT:  .Lcfi167:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi208:
+; NoVLX-NEXT:  .Lcfi168:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -5749,8 +5589,8 @@ define zeroext i64 @test_vpcmpeqd_v4i1_v
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -5789,12 +5629,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi209:
+; NoVLX-NEXT:  .Lcfi169:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi210:
+; NoVLX-NEXT:  .Lcfi170:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi211:
+; NoVLX-NEXT:  .Lcfi171:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -5802,7 +5642,6 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -5815,13 +5654,14 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -6052,12 +5892,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi212:
+; NoVLX-NEXT:  .Lcfi172:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi213:
+; NoVLX-NEXT:  .Lcfi173:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi214:
+; NoVLX-NEXT:  .Lcfi174:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6127,12 +5967,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi215:
+; NoVLX-NEXT:  .Lcfi175:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi216:
+; NoVLX-NEXT:  .Lcfi176:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi217:
+; NoVLX-NEXT:  .Lcfi177:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6204,12 +6044,12 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi218:
+; NoVLX-NEXT:  .Lcfi178:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi219:
+; NoVLX-NEXT:  .Lcfi179:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi220:
+; NoVLX-NEXT:  .Lcfi180:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6284,12 +6124,12 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi221:
+; NoVLX-NEXT:  .Lcfi181:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi222:
+; NoVLX-NEXT:  .Lcfi182:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi223:
+; NoVLX-NEXT:  .Lcfi183:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6365,12 +6205,12 @@ define zeroext i32 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi224:
+; NoVLX-NEXT:  .Lcfi184:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi225:
+; NoVLX-NEXT:  .Lcfi185:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi226:
+; NoVLX-NEXT:  .Lcfi186:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6443,12 +6283,12 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi227:
+; NoVLX-NEXT:  .Lcfi187:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi228:
+; NoVLX-NEXT:  .Lcfi188:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi229:
+; NoVLX-NEXT:  .Lcfi189:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -6525,55 +6365,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi230:
+; NoVLX-NEXT:  .Lcfi190:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi231:
+; NoVLX-NEXT:  .Lcfi191:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi232:
+; NoVLX-NEXT:  .Lcfi192:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6605,55 +6445,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi233:
+; NoVLX-NEXT:  .Lcfi193:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi234:
+; NoVLX-NEXT:  .Lcfi194:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi235:
+; NoVLX-NEXT:  .Lcfi195:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6687,12 +6527,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi236:
+; NoVLX-NEXT:  .Lcfi196:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi237:
+; NoVLX-NEXT:  .Lcfi197:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi238:
+; NoVLX-NEXT:  .Lcfi198:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6701,43 +6541,43 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6772,12 +6612,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi239:
+; NoVLX-NEXT:  .Lcfi199:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi240:
+; NoVLX-NEXT:  .Lcfi200:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi241:
+; NoVLX-NEXT:  .Lcfi201:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6786,43 +6626,43 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6858,55 +6698,55 @@ define zeroext i64 @test_vpcmpeqd_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi242:
+; NoVLX-NEXT:  .Lcfi202:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi243:
+; NoVLX-NEXT:  .Lcfi203:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi244:
+; NoVLX-NEXT:  .Lcfi204:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -6941,12 +6781,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi245:
+; NoVLX-NEXT:  .Lcfi205:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi246:
+; NoVLX-NEXT:  .Lcfi206:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi247:
+; NoVLX-NEXT:  .Lcfi207:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -6955,43 +6795,43 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -7028,93 +6868,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi248:
+; NoVLX-NEXT:  .Lcfi208:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi249:
+; NoVLX-NEXT:  .Lcfi209:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi250:
+; NoVLX-NEXT:  .Lcfi210:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi251:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi252:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi253:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi254:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi255:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7123,12 +6948,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7151,93 +6971,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi256:
+; NoVLX-NEXT:  .Lcfi211:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi257:
+; NoVLX-NEXT:  .Lcfi212:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi258:
+; NoVLX-NEXT:  .Lcfi213:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi259:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi260:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi261:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi262:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi263:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7246,12 +7051,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7276,94 +7076,79 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi264:
+; NoVLX-NEXT:  .Lcfi214:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi265:
+; NoVLX-NEXT:  .Lcfi215:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi266:
+; NoVLX-NEXT:  .Lcfi216:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi267:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi268:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi269:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi270:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi271:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7372,12 +7157,7 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7403,94 +7183,79 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi272:
+; NoVLX-NEXT:  .Lcfi217:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi273:
+; NoVLX-NEXT:  .Lcfi218:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi274:
+; NoVLX-NEXT:  .Lcfi219:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi275:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi276:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi277:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi278:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi279:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7499,12 +7264,7 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7531,93 +7291,78 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi280:
+; NoVLX-NEXT:  .Lcfi220:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi281:
+; NoVLX-NEXT:  .Lcfi221:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi282:
+; NoVLX-NEXT:  .Lcfi222:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi283:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi284:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi285:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi286:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi287:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7626,12 +7371,7 @@ define zeroext i32 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7657,94 +7397,79 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi288:
+; NoVLX-NEXT:  .Lcfi223:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi289:
+; NoVLX-NEXT:  .Lcfi224:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi290:
+; NoVLX-NEXT:  .Lcfi225:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi291:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi292:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi293:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi294:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi295:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -7753,12 +7478,7 @@ define zeroext i32 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -7786,12 +7506,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi296:
+; NoVLX-NEXT:  .Lcfi226:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi297:
+; NoVLX-NEXT:  .Lcfi227:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi298:
+; NoVLX-NEXT:  .Lcfi228:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -7800,21 +7520,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi299:
+; NoVLX-NEXT:  .Lcfi229:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi300:
+; NoVLX-NEXT:  .Lcfi230:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi301:
+; NoVLX-NEXT:  .Lcfi231:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi302:
+; NoVLX-NEXT:  .Lcfi232:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi303:
+; NoVLX-NEXT:  .Lcfi233:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -7857,11 +7573,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -7873,11 +7589,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -7914,12 +7634,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi304:
+; NoVLX-NEXT:  .Lcfi234:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi305:
+; NoVLX-NEXT:  .Lcfi235:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi306:
+; NoVLX-NEXT:  .Lcfi236:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -7928,21 +7648,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi307:
+; NoVLX-NEXT:  .Lcfi237:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi308:
+; NoVLX-NEXT:  .Lcfi238:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi309:
+; NoVLX-NEXT:  .Lcfi239:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi310:
+; NoVLX-NEXT:  .Lcfi240:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi311:
+; NoVLX-NEXT:  .Lcfi241:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -7985,11 +7701,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8001,11 +7717,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8044,12 +7764,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi312:
+; NoVLX-NEXT:  .Lcfi242:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi313:
+; NoVLX-NEXT:  .Lcfi243:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi314:
+; NoVLX-NEXT:  .Lcfi244:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8058,22 +7778,18 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi315:
+; NoVLX-NEXT:  .Lcfi245:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi316:
+; NoVLX-NEXT:  .Lcfi246:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi317:
+; NoVLX-NEXT:  .Lcfi247:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi318:
+; NoVLX-NEXT:  .Lcfi248:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi319:
+; NoVLX-NEXT:  .Lcfi249:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8116,11 +7832,11 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8132,11 +7848,15 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8176,12 +7896,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi320:
+; NoVLX-NEXT:  .Lcfi250:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi321:
+; NoVLX-NEXT:  .Lcfi251:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi322:
+; NoVLX-NEXT:  .Lcfi252:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8190,22 +7910,18 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi323:
+; NoVLX-NEXT:  .Lcfi253:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi324:
+; NoVLX-NEXT:  .Lcfi254:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi325:
+; NoVLX-NEXT:  .Lcfi255:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi326:
+; NoVLX-NEXT:  .Lcfi256:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi327:
+; NoVLX-NEXT:  .Lcfi257:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8248,11 +7964,11 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8264,11 +7980,15 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8309,12 +8029,12 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-LABEL: test_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi328:
+; NoVLX-NEXT:  .Lcfi258:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi329:
+; NoVLX-NEXT:  .Lcfi259:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi330:
+; NoVLX-NEXT:  .Lcfi260:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8323,21 +8043,17 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi331:
+; NoVLX-NEXT:  .Lcfi261:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi332:
+; NoVLX-NEXT:  .Lcfi262:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi333:
+; NoVLX-NEXT:  .Lcfi263:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi334:
+; NoVLX-NEXT:  .Lcfi264:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi335:
+; NoVLX-NEXT:  .Lcfi265:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpeqd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8380,11 +8096,11 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8396,11 +8112,15 @@ define zeroext i64 @test_vpcmpeqd_v16i1_
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8440,12 +8160,12 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-LABEL: test_masked_vpcmpeqd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi336:
+; NoVLX-NEXT:  .Lcfi266:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi337:
+; NoVLX-NEXT:  .Lcfi267:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi338:
+; NoVLX-NEXT:  .Lcfi268:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -8454,22 +8174,18 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi339:
+; NoVLX-NEXT:  .Lcfi269:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi340:
+; NoVLX-NEXT:  .Lcfi270:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi341:
+; NoVLX-NEXT:  .Lcfi271:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi342:
+; NoVLX-NEXT:  .Lcfi272:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi343:
+; NoVLX-NEXT:  .Lcfi273:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -8512,11 +8228,11 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -8528,11 +8244,15 @@ define zeroext i64 @test_masked_vpcmpeqd
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8629,7 +8349,6 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ;
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -8637,9 +8356,10 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -8744,7 +8464,6 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -8752,9 +8471,10 @@ define zeroext i4 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -9353,12 +9073,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi344:
+; NoVLX-NEXT:  .Lcfi274:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi345:
+; NoVLX-NEXT:  .Lcfi275:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi346:
+; NoVLX-NEXT:  .Lcfi276:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9396,12 +9116,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi347:
+; NoVLX-NEXT:  .Lcfi277:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi348:
+; NoVLX-NEXT:  .Lcfi278:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi349:
+; NoVLX-NEXT:  .Lcfi279:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9441,16 +9161,15 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi350:
+; NoVLX-NEXT:  .Lcfi280:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi351:
+; NoVLX-NEXT:  .Lcfi281:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi352:
+; NoVLX-NEXT:  .Lcfi282:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -9458,9 +9177,10 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -9498,12 +9218,12 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi353:
+; NoVLX-NEXT:  .Lcfi283:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi354:
+; NoVLX-NEXT:  .Lcfi284:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi355:
+; NoVLX-NEXT:  .Lcfi285:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9556,12 +9276,12 @@ define zeroext i32 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi356:
+; NoVLX-NEXT:  .Lcfi286:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi357:
+; NoVLX-NEXT:  .Lcfi287:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi358:
+; NoVLX-NEXT:  .Lcfi288:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -9603,17 +9323,16 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi359:
+; NoVLX-NEXT:  .Lcfi289:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi360:
+; NoVLX-NEXT:  .Lcfi290:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi361:
+; NoVLX-NEXT:  .Lcfi291:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -9621,9 +9340,10 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -9663,20 +9383,20 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi362:
+; NoVLX-NEXT:  .Lcfi292:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi363:
+; NoVLX-NEXT:  .Lcfi293:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi364:
+; NoVLX-NEXT:  .Lcfi294:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9712,20 +9432,20 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi365:
+; NoVLX-NEXT:  .Lcfi295:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi366:
+; NoVLX-NEXT:  .Lcfi296:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi367:
+; NoVLX-NEXT:  .Lcfi297:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9763,12 +9483,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi368:
+; NoVLX-NEXT:  .Lcfi298:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi369:
+; NoVLX-NEXT:  .Lcfi299:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi370:
+; NoVLX-NEXT:  .Lcfi300:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9785,8 +9505,8 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9826,12 +9546,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi371:
+; NoVLX-NEXT:  .Lcfi301:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi372:
+; NoVLX-NEXT:  .Lcfi302:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi373:
+; NoVLX-NEXT:  .Lcfi303:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9848,8 +9568,8 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9890,12 +9610,12 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi374:
+; NoVLX-NEXT:  .Lcfi304:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi375:
+; NoVLX-NEXT:  .Lcfi305:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi376:
+; NoVLX-NEXT:  .Lcfi306:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9903,8 +9623,8 @@ define zeroext i64 @test_vpcmpeqq_v2i1_v
 ; NoVLX-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -9943,12 +9663,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi377:
+; NoVLX-NEXT:  .Lcfi307:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi378:
+; NoVLX-NEXT:  .Lcfi308:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi379:
+; NoVLX-NEXT:  .Lcfi309:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -9966,8 +9686,8 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -10028,8 +9748,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10085,8 +9805,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10127,7 +9847,6 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10145,6 +9864,7 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10162,8 +9882,8 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10206,7 +9926,6 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10224,6 +9943,7 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10241,8 +9961,8 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10304,8 +10024,8 @@ define zeroext i8 @test_vpcmpeqq_v4i1_v8
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10348,7 +10068,6 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10366,6 +10085,7 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10383,8 +10103,8 @@ define zeroext i8 @test_masked_vpcmpeqq_
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -10445,8 +10165,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10501,8 +10221,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10543,7 +10263,6 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10561,6 +10280,7 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10577,8 +10297,8 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10621,7 +10341,6 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10639,6 +10358,7 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10655,8 +10375,8 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10717,8 +10437,8 @@ define zeroext i16 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10761,7 +10481,6 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10779,6 +10498,7 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -10795,8 +10515,8 @@ define zeroext i16 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -10840,12 +10560,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi380:
+; NoVLX-NEXT:  .Lcfi310:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi381:
+; NoVLX-NEXT:  .Lcfi311:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi382:
+; NoVLX-NEXT:  .Lcfi312:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -10885,12 +10605,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi383:
+; NoVLX-NEXT:  .Lcfi313:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi384:
+; NoVLX-NEXT:  .Lcfi314:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi385:
+; NoVLX-NEXT:  .Lcfi315:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -10932,17 +10652,16 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi386:
+; NoVLX-NEXT:  .Lcfi316:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi387:
+; NoVLX-NEXT:  .Lcfi317:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi388:
+; NoVLX-NEXT:  .Lcfi318:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -10960,6 +10679,7 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -10999,17 +10719,16 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi389:
+; NoVLX-NEXT:  .Lcfi319:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi390:
+; NoVLX-NEXT:  .Lcfi320:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi391:
+; NoVLX-NEXT:  .Lcfi321:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -11027,6 +10746,7 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -11067,12 +10787,12 @@ define zeroext i32 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi392:
+; NoVLX-NEXT:  .Lcfi322:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi393:
+; NoVLX-NEXT:  .Lcfi323:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi394:
+; NoVLX-NEXT:  .Lcfi324:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11116,18 +10836,17 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi395:
+; NoVLX-NEXT:  .Lcfi325:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi396:
+; NoVLX-NEXT:  .Lcfi326:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi397:
+; NoVLX-NEXT:  .Lcfi327:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -11145,6 +10864,7 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -11186,12 +10906,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi398:
+; NoVLX-NEXT:  .Lcfi328:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi399:
+; NoVLX-NEXT:  .Lcfi329:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi400:
+; NoVLX-NEXT:  .Lcfi330:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11199,8 +10919,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11237,12 +10957,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi401:
+; NoVLX-NEXT:  .Lcfi331:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi402:
+; NoVLX-NEXT:  .Lcfi332:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi403:
+; NoVLX-NEXT:  .Lcfi333:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11250,8 +10970,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11290,12 +11010,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi404:
+; NoVLX-NEXT:  .Lcfi334:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi405:
+; NoVLX-NEXT:  .Lcfi335:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi406:
+; NoVLX-NEXT:  .Lcfi336:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11303,7 +11023,6 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11316,13 +11035,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11363,12 +11083,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi407:
+; NoVLX-NEXT:  .Lcfi337:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi408:
+; NoVLX-NEXT:  .Lcfi338:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi409:
+; NoVLX-NEXT:  .Lcfi339:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11376,7 +11096,6 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11389,13 +11108,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11437,12 +11157,12 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi410:
+; NoVLX-NEXT:  .Lcfi340:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi411:
+; NoVLX-NEXT:  .Lcfi341:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi412:
+; NoVLX-NEXT:  .Lcfi342:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11451,8 +11171,8 @@ define zeroext i64 @test_vpcmpeqq_v4i1_v
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11492,12 +11212,12 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi413:
+; NoVLX-NEXT:  .Lcfi343:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi414:
+; NoVLX-NEXT:  .Lcfi344:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi415:
+; NoVLX-NEXT:  .Lcfi345:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -11506,7 +11226,6 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -11519,13 +11238,14 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -11732,12 +11452,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi416:
+; NoVLX-NEXT:  .Lcfi346:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi417:
+; NoVLX-NEXT:  .Lcfi347:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi418:
+; NoVLX-NEXT:  .Lcfi348:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11805,12 +11525,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi419:
+; NoVLX-NEXT:  .Lcfi349:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi420:
+; NoVLX-NEXT:  .Lcfi350:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi421:
+; NoVLX-NEXT:  .Lcfi351:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11880,12 +11600,12 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi422:
+; NoVLX-NEXT:  .Lcfi352:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi423:
+; NoVLX-NEXT:  .Lcfi353:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi424:
+; NoVLX-NEXT:  .Lcfi354:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -11957,12 +11677,12 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi425:
+; NoVLX-NEXT:  .Lcfi355:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi426:
+; NoVLX-NEXT:  .Lcfi356:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi427:
+; NoVLX-NEXT:  .Lcfi357:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12035,12 +11755,12 @@ define zeroext i32 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi428:
+; NoVLX-NEXT:  .Lcfi358:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi429:
+; NoVLX-NEXT:  .Lcfi359:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi430:
+; NoVLX-NEXT:  .Lcfi360:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12111,12 +11831,12 @@ define zeroext i32 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi431:
+; NoVLX-NEXT:  .Lcfi361:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi432:
+; NoVLX-NEXT:  .Lcfi362:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi433:
+; NoVLX-NEXT:  .Lcfi363:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -12190,53 +11910,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi434:
+; NoVLX-NEXT:  .Lcfi364:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi435:
+; NoVLX-NEXT:  .Lcfi365:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi436:
+; NoVLX-NEXT:  .Lcfi366:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12268,53 +11988,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi437:
+; NoVLX-NEXT:  .Lcfi367:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi438:
+; NoVLX-NEXT:  .Lcfi368:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi439:
+; NoVLX-NEXT:  .Lcfi369:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12348,54 +12068,54 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi440:
+; NoVLX-NEXT:  .Lcfi370:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi441:
+; NoVLX-NEXT:  .Lcfi371:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi442:
+; NoVLX-NEXT:  .Lcfi372:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12430,54 +12150,54 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi443:
+; NoVLX-NEXT:  .Lcfi373:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi444:
+; NoVLX-NEXT:  .Lcfi374:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi445:
+; NoVLX-NEXT:  .Lcfi375:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12513,53 +12233,53 @@ define zeroext i64 @test_vpcmpeqq_v8i1_v
 ; NoVLX-LABEL: test_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi446:
+; NoVLX-NEXT:  .Lcfi376:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi447:
+; NoVLX-NEXT:  .Lcfi377:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi448:
+; NoVLX-NEXT:  .Lcfi378:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12594,54 +12314,54 @@ define zeroext i64 @test_masked_vpcmpeqq
 ; NoVLX-LABEL: test_masked_vpcmpeqq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi449:
+; NoVLX-NEXT:  .Lcfi379:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi450:
+; NoVLX-NEXT:  .Lcfi380:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi451:
+; NoVLX-NEXT:  .Lcfi381:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpeqq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -12677,30 +12397,15 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi452:
+; NoVLX-NEXT:  .Lcfi382:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi453:
+; NoVLX-NEXT:  .Lcfi383:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi454:
+; NoVLX-NEXT:  .Lcfi384:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi455:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi456:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi457:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi458:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi459:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12709,64 +12414,64 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12775,12 +12480,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -12802,30 +12502,15 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi460:
+; NoVLX-NEXT:  .Lcfi385:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi461:
+; NoVLX-NEXT:  .Lcfi386:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi462:
+; NoVLX-NEXT:  .Lcfi387:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi463:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi464:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi465:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi466:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi467:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12834,64 +12519,64 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -12900,12 +12585,7 @@ define zeroext i32 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -12929,30 +12609,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi468:
+; NoVLX-NEXT:  .Lcfi388:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi469:
+; NoVLX-NEXT:  .Lcfi389:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi470:
+; NoVLX-NEXT:  .Lcfi390:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi471:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi472:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi473:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi474:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi475:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -12962,64 +12627,64 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13028,12 +12693,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -13058,30 +12718,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi476:
+; NoVLX-NEXT:  .Lcfi391:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi477:
+; NoVLX-NEXT:  .Lcfi392:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi478:
+; NoVLX-NEXT:  .Lcfi393:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi479:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi480:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi481:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi482:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi483:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -13091,64 +12736,64 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -13157,12 +12802,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -13188,12 +12828,12 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi484:
+; NoVLX-NEXT:  .Lcfi394:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi485:
+; NoVLX-NEXT:  .Lcfi395:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi486:
+; NoVLX-NEXT:  .Lcfi396:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13202,24 +12842,20 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi487:
+; NoVLX-NEXT:  .Lcfi397:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi488:
+; NoVLX-NEXT:  .Lcfi398:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi489:
+; NoVLX-NEXT:  .Lcfi399:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi490:
+; NoVLX-NEXT:  .Lcfi400:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi491:
+; NoVLX-NEXT:  .Lcfi401:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13262,11 +12898,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13278,11 +12914,15 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13318,12 +12958,12 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi492:
+; NoVLX-NEXT:  .Lcfi402:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi493:
+; NoVLX-NEXT:  .Lcfi403:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi494:
+; NoVLX-NEXT:  .Lcfi404:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13332,24 +12972,20 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi495:
+; NoVLX-NEXT:  .Lcfi405:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi496:
+; NoVLX-NEXT:  .Lcfi406:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi497:
+; NoVLX-NEXT:  .Lcfi407:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi498:
+; NoVLX-NEXT:  .Lcfi408:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi499:
+; NoVLX-NEXT:  .Lcfi409:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13392,11 +13028,11 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13408,11 +13044,15 @@ define zeroext i64 @test_vpcmpsgtb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13450,12 +13090,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi500:
+; NoVLX-NEXT:  .Lcfi410:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi501:
+; NoVLX-NEXT:  .Lcfi411:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi502:
+; NoVLX-NEXT:  .Lcfi412:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13464,25 +13104,21 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi503:
+; NoVLX-NEXT:  .Lcfi413:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi504:
+; NoVLX-NEXT:  .Lcfi414:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi505:
+; NoVLX-NEXT:  .Lcfi415:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi506:
+; NoVLX-NEXT:  .Lcfi416:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi507:
+; NoVLX-NEXT:  .Lcfi417:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13525,11 +13161,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13541,11 +13177,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13584,12 +13224,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi508:
+; NoVLX-NEXT:  .Lcfi418:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi509:
+; NoVLX-NEXT:  .Lcfi419:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi510:
+; NoVLX-NEXT:  .Lcfi420:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -13598,25 +13238,21 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi511:
+; NoVLX-NEXT:  .Lcfi421:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi512:
+; NoVLX-NEXT:  .Lcfi422:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi513:
+; NoVLX-NEXT:  .Lcfi423:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi514:
+; NoVLX-NEXT:  .Lcfi424:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi515:
+; NoVLX-NEXT:  .Lcfi425:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -13659,11 +13295,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -13675,11 +13311,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -13720,12 +13360,12 @@ define zeroext i64 @test_vpcmpsgtb_v32i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi516:
+; NoVLX-NEXT:  .Lcfi426:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi517:
+; NoVLX-NEXT:  .Lcfi427:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi518:
+; NoVLX-NEXT:  .Lcfi428:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -13769,12 +13409,12 @@ define zeroext i64 @test_vpcmpsgtb_v32i1
 ; NoVLX-LABEL: test_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi519:
+; NoVLX-NEXT:  .Lcfi429:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi520:
+; NoVLX-NEXT:  .Lcfi430:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi521:
+; NoVLX-NEXT:  .Lcfi431:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -13820,12 +13460,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi522:
+; NoVLX-NEXT:  .Lcfi432:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi523:
+; NoVLX-NEXT:  .Lcfi433:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi524:
+; NoVLX-NEXT:  .Lcfi434:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -13881,12 +13521,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi525:
+; NoVLX-NEXT:  .Lcfi435:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi526:
+; NoVLX-NEXT:  .Lcfi436:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi527:
+; NoVLX-NEXT:  .Lcfi437:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -14065,12 +13705,12 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi528:
+; NoVLX-NEXT:  .Lcfi438:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi529:
+; NoVLX-NEXT:  .Lcfi439:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi530:
+; NoVLX-NEXT:  .Lcfi440:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14140,12 +13780,12 @@ define zeroext i32 @test_vpcmpsgtw_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi531:
+; NoVLX-NEXT:  .Lcfi441:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi532:
+; NoVLX-NEXT:  .Lcfi442:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi533:
+; NoVLX-NEXT:  .Lcfi443:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14217,12 +13857,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi534:
+; NoVLX-NEXT:  .Lcfi444:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi535:
+; NoVLX-NEXT:  .Lcfi445:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi536:
+; NoVLX-NEXT:  .Lcfi446:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14296,12 +13936,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi537:
+; NoVLX-NEXT:  .Lcfi447:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi538:
+; NoVLX-NEXT:  .Lcfi448:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi539:
+; NoVLX-NEXT:  .Lcfi449:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -14376,12 +14016,12 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi540:
+; NoVLX-NEXT:  .Lcfi450:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi541:
+; NoVLX-NEXT:  .Lcfi451:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi542:
+; NoVLX-NEXT:  .Lcfi452:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14389,43 +14029,43 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14456,12 +14096,12 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi543:
+; NoVLX-NEXT:  .Lcfi453:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi544:
+; NoVLX-NEXT:  .Lcfi454:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi545:
+; NoVLX-NEXT:  .Lcfi455:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14469,43 +14109,43 @@ define zeroext i64 @test_vpcmpsgtw_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14538,12 +14178,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi546:
+; NoVLX-NEXT:  .Lcfi456:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi547:
+; NoVLX-NEXT:  .Lcfi457:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi548:
+; NoVLX-NEXT:  .Lcfi458:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14552,43 +14192,43 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14622,12 +14262,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi549:
+; NoVLX-NEXT:  .Lcfi459:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi550:
+; NoVLX-NEXT:  .Lcfi460:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi551:
+; NoVLX-NEXT:  .Lcfi461:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -14636,43 +14276,43 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -14708,30 +14348,15 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi552:
+; NoVLX-NEXT:  .Lcfi462:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi553:
+; NoVLX-NEXT:  .Lcfi463:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi554:
+; NoVLX-NEXT:  .Lcfi464:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi555:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi556:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi557:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi558:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi559:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14740,64 +14365,64 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14806,12 +14431,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -14834,30 +14454,15 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi560:
+; NoVLX-NEXT:  .Lcfi465:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi561:
+; NoVLX-NEXT:  .Lcfi466:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi562:
+; NoVLX-NEXT:  .Lcfi467:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi563:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi564:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi565:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi566:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi567:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14866,64 +14471,64 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -14932,12 +14537,7 @@ define zeroext i32 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -14962,30 +14562,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi568:
+; NoVLX-NEXT:  .Lcfi468:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi569:
+; NoVLX-NEXT:  .Lcfi469:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi570:
+; NoVLX-NEXT:  .Lcfi470:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi571:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi572:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi573:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi574:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi575:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -14995,64 +14580,64 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15061,12 +14646,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -15092,30 +14672,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi576:
+; NoVLX-NEXT:  .Lcfi471:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi577:
+; NoVLX-NEXT:  .Lcfi472:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi578:
+; NoVLX-NEXT:  .Lcfi473:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi579:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi580:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi581:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi582:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi583:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -15125,64 +14690,64 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -15191,12 +14756,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -15223,12 +14783,12 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi584:
+; NoVLX-NEXT:  .Lcfi474:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi585:
+; NoVLX-NEXT:  .Lcfi475:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi586:
+; NoVLX-NEXT:  .Lcfi476:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15237,24 +14797,20 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi587:
+; NoVLX-NEXT:  .Lcfi477:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi588:
+; NoVLX-NEXT:  .Lcfi478:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi589:
+; NoVLX-NEXT:  .Lcfi479:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi590:
+; NoVLX-NEXT:  .Lcfi480:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi591:
+; NoVLX-NEXT:  .Lcfi481:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15297,11 +14853,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15313,11 +14869,15 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15354,12 +14914,12 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi592:
+; NoVLX-NEXT:  .Lcfi482:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi593:
+; NoVLX-NEXT:  .Lcfi483:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi594:
+; NoVLX-NEXT:  .Lcfi484:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15368,24 +14928,20 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi595:
+; NoVLX-NEXT:  .Lcfi485:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi596:
+; NoVLX-NEXT:  .Lcfi486:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi597:
+; NoVLX-NEXT:  .Lcfi487:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi598:
+; NoVLX-NEXT:  .Lcfi488:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi599:
+; NoVLX-NEXT:  .Lcfi489:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rdi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15428,11 +14984,11 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15444,11 +15000,15 @@ define zeroext i64 @test_vpcmpsgtw_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15487,12 +15047,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi600:
+; NoVLX-NEXT:  .Lcfi490:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi601:
+; NoVLX-NEXT:  .Lcfi491:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi602:
+; NoVLX-NEXT:  .Lcfi492:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15501,25 +15061,21 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi603:
+; NoVLX-NEXT:  .Lcfi493:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi604:
+; NoVLX-NEXT:  .Lcfi494:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi605:
+; NoVLX-NEXT:  .Lcfi495:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi606:
+; NoVLX-NEXT:  .Lcfi496:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi607:
+; NoVLX-NEXT:  .Lcfi497:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15562,11 +15118,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15578,11 +15134,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15622,12 +15182,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi608:
+; NoVLX-NEXT:  .Lcfi498:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi609:
+; NoVLX-NEXT:  .Lcfi499:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi610:
+; NoVLX-NEXT:  .Lcfi500:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -15636,25 +15196,21 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi611:
+; NoVLX-NEXT:  .Lcfi501:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi612:
+; NoVLX-NEXT:  .Lcfi502:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi613:
+; NoVLX-NEXT:  .Lcfi503:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi614:
+; NoVLX-NEXT:  .Lcfi504:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi615:
+; NoVLX-NEXT:  .Lcfi505:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm0, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -15697,11 +15253,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -15713,11 +15269,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -15758,62 +15318,58 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi616:
+; NoVLX-NEXT:  .Lcfi506:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi617:
+; NoVLX-NEXT:  .Lcfi507:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi618:
+; NoVLX-NEXT:  .Lcfi508:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -15821,190 +15377,194 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm3, %ymm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm8, %ymm1
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
-; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm2, %ymm2
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -16109,69 +15669,68 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-LABEL: test_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi619:
+; NoVLX-NEXT:  .Lcfi509:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi620:
+; NoVLX-NEXT:  .Lcfi510:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi621:
+; NoVLX-NEXT:  .Lcfi511:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -16179,7 +15738,8 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -16189,19 +15749,19 @@ define zeroext i64 @test_vpcmpsgtw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw 32(%rdi), %ymm1, %ymm1
 ; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
@@ -16377,12 +15937,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi622:
+; NoVLX-NEXT:  .Lcfi512:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi623:
+; NoVLX-NEXT:  .Lcfi513:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi624:
+; NoVLX-NEXT:  .Lcfi514:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -16393,17 +15953,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16411,9 +15966,10 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm4
@@ -16431,39 +15987,40 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
@@ -16471,69 +16028,72 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm6
+; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm6
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -16542,18 +16102,13 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm3
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm4
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16562,147 +16117,152 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm4, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpcmpgtw %ymm4, %ymm1, %ymm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm5, %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -16740,12 +16300,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtw_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi625:
+; NoVLX-NEXT:  .Lcfi515:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi626:
+; NoVLX-NEXT:  .Lcfi516:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi627:
+; NoVLX-NEXT:  .Lcfi517:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -16757,8 +16317,6 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -16771,19 +16329,20 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -16791,6 +16350,7 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -16813,174 +16373,174 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm5
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm2, %ymm2
-; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm2, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm4
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
+; NoVLX-NEXT:    vpcmpgtw (%rsi), %ymm3, %ymm3
+; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
+; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
+; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm1
+; NoVLX-NEXT:    vpcmpgtw 32(%rsi), %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -17035,8 +16595,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17090,8 +16650,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17130,7 +16690,6 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17142,13 +16701,14 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17165,8 +16725,8 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17242,8 +16802,8 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17303,8 +16863,8 @@ define zeroext i8 @test_vpcmpsgtd_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17345,7 +16905,6 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17357,13 +16916,14 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17380,8 +16940,8 @@ define zeroext i8 @test_masked_vpcmpsgtd
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -17440,8 +17000,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17494,8 +17054,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17534,7 +17094,6 @@ define zeroext i16 @test_masked_vpcmpsgt
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17546,13 +17105,14 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17568,8 +17128,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17644,8 +17204,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17704,8 +17264,8 @@ define zeroext i16 @test_vpcmpsgtd_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17746,7 +17306,6 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17758,13 +17317,14 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -17780,8 +17340,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -17824,12 +17384,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi628:
+; NoVLX-NEXT:  .Lcfi518:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi629:
+; NoVLX-NEXT:  .Lcfi519:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi630:
+; NoVLX-NEXT:  .Lcfi520:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -17867,12 +17427,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi631:
+; NoVLX-NEXT:  .Lcfi521:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi632:
+; NoVLX-NEXT:  .Lcfi522:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi633:
+; NoVLX-NEXT:  .Lcfi523:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -17912,16 +17472,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi634:
+; NoVLX-NEXT:  .Lcfi524:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi635:
+; NoVLX-NEXT:  .Lcfi525:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi636:
+; NoVLX-NEXT:  .Lcfi526:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -17933,13 +17492,14 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -17977,12 +17537,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi637:
+; NoVLX-NEXT:  .Lcfi527:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi638:
+; NoVLX-NEXT:  .Lcfi528:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi639:
+; NoVLX-NEXT:  .Lcfi529:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18043,12 +17603,12 @@ define zeroext i32 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi640:
+; NoVLX-NEXT:  .Lcfi530:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi641:
+; NoVLX-NEXT:  .Lcfi531:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi642:
+; NoVLX-NEXT:  .Lcfi532:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18090,17 +17650,16 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi643:
+; NoVLX-NEXT:  .Lcfi533:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi644:
+; NoVLX-NEXT:  .Lcfi534:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi645:
+; NoVLX-NEXT:  .Lcfi535:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -18112,13 +17671,14 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -18158,20 +17718,20 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi646:
+; NoVLX-NEXT:  .Lcfi536:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi647:
+; NoVLX-NEXT:  .Lcfi537:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi648:
+; NoVLX-NEXT:  .Lcfi538:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18207,20 +17767,20 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi649:
+; NoVLX-NEXT:  .Lcfi539:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi650:
+; NoVLX-NEXT:  .Lcfi540:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi651:
+; NoVLX-NEXT:  .Lcfi541:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18258,19 +17818,18 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi652:
+; NoVLX-NEXT:  .Lcfi542:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi653:
+; NoVLX-NEXT:  .Lcfi543:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi654:
+; NoVLX-NEXT:  .Lcfi544:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18283,13 +17842,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18329,19 +17889,18 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi655:
+; NoVLX-NEXT:  .Lcfi545:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi656:
+; NoVLX-NEXT:  .Lcfi546:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi657:
+; NoVLX-NEXT:  .Lcfi547:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18354,13 +17913,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18401,12 +17961,12 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi658:
+; NoVLX-NEXT:  .Lcfi548:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi659:
+; NoVLX-NEXT:  .Lcfi549:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi660:
+; NoVLX-NEXT:  .Lcfi550:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -18414,8 +17974,8 @@ define zeroext i64 @test_vpcmpsgtd_v4i1_
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18454,12 +18014,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi661:
+; NoVLX-NEXT:  .Lcfi551:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi662:
+; NoVLX-NEXT:  .Lcfi552:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi663:
+; NoVLX-NEXT:  .Lcfi553:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -18467,7 +18027,6 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -18480,13 +18039,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -18717,12 +18277,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi664:
+; NoVLX-NEXT:  .Lcfi554:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi665:
+; NoVLX-NEXT:  .Lcfi555:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi666:
+; NoVLX-NEXT:  .Lcfi556:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18792,12 +18352,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi667:
+; NoVLX-NEXT:  .Lcfi557:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi668:
+; NoVLX-NEXT:  .Lcfi558:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi669:
+; NoVLX-NEXT:  .Lcfi559:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18869,12 +18429,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi670:
+; NoVLX-NEXT:  .Lcfi560:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi671:
+; NoVLX-NEXT:  .Lcfi561:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi672:
+; NoVLX-NEXT:  .Lcfi562:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -18949,12 +18509,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi673:
+; NoVLX-NEXT:  .Lcfi563:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi674:
+; NoVLX-NEXT:  .Lcfi564:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi675:
+; NoVLX-NEXT:  .Lcfi565:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19030,12 +18590,12 @@ define zeroext i32 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi676:
+; NoVLX-NEXT:  .Lcfi566:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi677:
+; NoVLX-NEXT:  .Lcfi567:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi678:
+; NoVLX-NEXT:  .Lcfi568:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19108,12 +18668,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi679:
+; NoVLX-NEXT:  .Lcfi569:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi680:
+; NoVLX-NEXT:  .Lcfi570:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi681:
+; NoVLX-NEXT:  .Lcfi571:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -19190,55 +18750,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi682:
+; NoVLX-NEXT:  .Lcfi572:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi683:
+; NoVLX-NEXT:  .Lcfi573:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi684:
+; NoVLX-NEXT:  .Lcfi574:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19270,55 +18830,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi685:
+; NoVLX-NEXT:  .Lcfi575:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi686:
+; NoVLX-NEXT:  .Lcfi576:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi687:
+; NoVLX-NEXT:  .Lcfi577:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19352,12 +18912,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi688:
+; NoVLX-NEXT:  .Lcfi578:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi689:
+; NoVLX-NEXT:  .Lcfi579:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi690:
+; NoVLX-NEXT:  .Lcfi580:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19366,43 +18926,43 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19437,12 +18997,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi691:
+; NoVLX-NEXT:  .Lcfi581:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi692:
+; NoVLX-NEXT:  .Lcfi582:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi693:
+; NoVLX-NEXT:  .Lcfi583:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19451,43 +19011,43 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19523,55 +19083,55 @@ define zeroext i64 @test_vpcmpsgtd_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi694:
+; NoVLX-NEXT:  .Lcfi584:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi695:
+; NoVLX-NEXT:  .Lcfi585:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi696:
+; NoVLX-NEXT:  .Lcfi586:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19606,12 +19166,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi697:
+; NoVLX-NEXT:  .Lcfi587:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi698:
+; NoVLX-NEXT:  .Lcfi588:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi699:
+; NoVLX-NEXT:  .Lcfi589:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -19620,43 +19180,43 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -19693,93 +19253,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi700:
+; NoVLX-NEXT:  .Lcfi590:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi701:
+; NoVLX-NEXT:  .Lcfi591:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi702:
+; NoVLX-NEXT:  .Lcfi592:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi703:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi704:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi705:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi706:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi707:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19788,12 +19333,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -19816,93 +19356,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi708:
+; NoVLX-NEXT:  .Lcfi593:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi709:
+; NoVLX-NEXT:  .Lcfi594:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi710:
+; NoVLX-NEXT:  .Lcfi595:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi711:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi712:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi713:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi714:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi715:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -19911,12 +19436,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -19941,94 +19461,79 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi716:
+; NoVLX-NEXT:  .Lcfi596:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi717:
+; NoVLX-NEXT:  .Lcfi597:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi718:
+; NoVLX-NEXT:  .Lcfi598:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi719:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi720:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi721:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi722:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi723:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20037,12 +19542,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20068,94 +19568,79 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi724:
+; NoVLX-NEXT:  .Lcfi599:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi725:
+; NoVLX-NEXT:  .Lcfi600:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi726:
+; NoVLX-NEXT:  .Lcfi601:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi727:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi728:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi729:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi730:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi731:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20164,12 +19649,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20196,93 +19676,78 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi732:
+; NoVLX-NEXT:  .Lcfi602:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi733:
+; NoVLX-NEXT:  .Lcfi603:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi734:
+; NoVLX-NEXT:  .Lcfi604:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi735:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi736:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi737:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi738:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi739:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20291,12 +19756,7 @@ define zeroext i32 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20322,94 +19782,79 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi740:
+; NoVLX-NEXT:  .Lcfi605:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi741:
+; NoVLX-NEXT:  .Lcfi606:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi742:
+; NoVLX-NEXT:  .Lcfi607:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi743:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi744:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi745:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi746:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi747:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -20418,12 +19863,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -20451,12 +19891,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi748:
+; NoVLX-NEXT:  .Lcfi608:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi749:
+; NoVLX-NEXT:  .Lcfi609:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi750:
+; NoVLX-NEXT:  .Lcfi610:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20465,21 +19905,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi751:
+; NoVLX-NEXT:  .Lcfi611:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi752:
+; NoVLX-NEXT:  .Lcfi612:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi753:
+; NoVLX-NEXT:  .Lcfi613:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi754:
+; NoVLX-NEXT:  .Lcfi614:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi755:
+; NoVLX-NEXT:  .Lcfi615:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20522,11 +19958,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20538,11 +19974,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20579,12 +20019,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi756:
+; NoVLX-NEXT:  .Lcfi616:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi757:
+; NoVLX-NEXT:  .Lcfi617:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi758:
+; NoVLX-NEXT:  .Lcfi618:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20593,21 +20033,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi759:
+; NoVLX-NEXT:  .Lcfi619:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi760:
+; NoVLX-NEXT:  .Lcfi620:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi761:
+; NoVLX-NEXT:  .Lcfi621:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi762:
+; NoVLX-NEXT:  .Lcfi622:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi763:
+; NoVLX-NEXT:  .Lcfi623:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20650,11 +20086,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20666,11 +20102,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20709,12 +20149,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi764:
+; NoVLX-NEXT:  .Lcfi624:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi765:
+; NoVLX-NEXT:  .Lcfi625:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi766:
+; NoVLX-NEXT:  .Lcfi626:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20723,22 +20163,18 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi767:
+; NoVLX-NEXT:  .Lcfi627:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi768:
+; NoVLX-NEXT:  .Lcfi628:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi769:
+; NoVLX-NEXT:  .Lcfi629:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi770:
+; NoVLX-NEXT:  .Lcfi630:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi771:
+; NoVLX-NEXT:  .Lcfi631:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20781,11 +20217,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20797,11 +20233,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20841,12 +20281,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi772:
+; NoVLX-NEXT:  .Lcfi632:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi773:
+; NoVLX-NEXT:  .Lcfi633:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi774:
+; NoVLX-NEXT:  .Lcfi634:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20855,22 +20295,18 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi775:
+; NoVLX-NEXT:  .Lcfi635:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi776:
+; NoVLX-NEXT:  .Lcfi636:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi777:
+; NoVLX-NEXT:  .Lcfi637:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi778:
+; NoVLX-NEXT:  .Lcfi638:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi779:
+; NoVLX-NEXT:  .Lcfi639:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -20913,11 +20349,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -20929,11 +20365,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -20974,12 +20414,12 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-LABEL: test_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi780:
+; NoVLX-NEXT:  .Lcfi640:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi781:
+; NoVLX-NEXT:  .Lcfi641:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi782:
+; NoVLX-NEXT:  .Lcfi642:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -20988,21 +20428,17 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi783:
+; NoVLX-NEXT:  .Lcfi643:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi784:
+; NoVLX-NEXT:  .Lcfi644:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi785:
+; NoVLX-NEXT:  .Lcfi645:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi786:
+; NoVLX-NEXT:  .Lcfi646:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi787:
+; NoVLX-NEXT:  .Lcfi647:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -21045,11 +20481,11 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21061,11 +20497,15 @@ define zeroext i64 @test_vpcmpsgtd_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21105,12 +20545,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtd_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi788:
+; NoVLX-NEXT:  .Lcfi648:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi789:
+; NoVLX-NEXT:  .Lcfi649:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi790:
+; NoVLX-NEXT:  .Lcfi650:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -21119,22 +20559,18 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi791:
+; NoVLX-NEXT:  .Lcfi651:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi792:
+; NoVLX-NEXT:  .Lcfi652:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi793:
+; NoVLX-NEXT:  .Lcfi653:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi794:
+; NoVLX-NEXT:  .Lcfi654:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi795:
+; NoVLX-NEXT:  .Lcfi655:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtd (%rsi){1to16}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -21177,11 +20613,11 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -21193,11 +20629,15 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21294,7 +20734,6 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -21302,9 +20741,10 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -21409,7 +20849,6 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -21417,9 +20856,10 @@ define zeroext i4 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -22018,12 +21458,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi796:
+; NoVLX-NEXT:  .Lcfi656:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi797:
+; NoVLX-NEXT:  .Lcfi657:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi798:
+; NoVLX-NEXT:  .Lcfi658:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22061,12 +21501,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi799:
+; NoVLX-NEXT:  .Lcfi659:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi800:
+; NoVLX-NEXT:  .Lcfi660:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi801:
+; NoVLX-NEXT:  .Lcfi661:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22106,16 +21546,15 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi802:
+; NoVLX-NEXT:  .Lcfi662:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi803:
+; NoVLX-NEXT:  .Lcfi663:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi804:
+; NoVLX-NEXT:  .Lcfi664:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22123,9 +21562,10 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -22163,12 +21603,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi805:
+; NoVLX-NEXT:  .Lcfi665:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi806:
+; NoVLX-NEXT:  .Lcfi666:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi807:
+; NoVLX-NEXT:  .Lcfi667:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22221,12 +21661,12 @@ define zeroext i32 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi808:
+; NoVLX-NEXT:  .Lcfi668:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi809:
+; NoVLX-NEXT:  .Lcfi669:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi810:
+; NoVLX-NEXT:  .Lcfi670:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -22268,17 +21708,16 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi811:
+; NoVLX-NEXT:  .Lcfi671:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi812:
+; NoVLX-NEXT:  .Lcfi672:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi813:
+; NoVLX-NEXT:  .Lcfi673:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22286,9 +21725,10 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -22328,20 +21768,20 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi814:
+; NoVLX-NEXT:  .Lcfi674:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi815:
+; NoVLX-NEXT:  .Lcfi675:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi816:
+; NoVLX-NEXT:  .Lcfi676:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22377,20 +21817,20 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi817:
+; NoVLX-NEXT:  .Lcfi677:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi818:
+; NoVLX-NEXT:  .Lcfi678:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi819:
+; NoVLX-NEXT:  .Lcfi679:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22428,12 +21868,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi820:
+; NoVLX-NEXT:  .Lcfi680:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi821:
+; NoVLX-NEXT:  .Lcfi681:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi822:
+; NoVLX-NEXT:  .Lcfi682:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22450,8 +21890,8 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22491,12 +21931,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi823:
+; NoVLX-NEXT:  .Lcfi683:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi824:
+; NoVLX-NEXT:  .Lcfi684:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi825:
+; NoVLX-NEXT:  .Lcfi685:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22513,8 +21953,8 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22555,12 +21995,12 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi826:
+; NoVLX-NEXT:  .Lcfi686:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi827:
+; NoVLX-NEXT:  .Lcfi687:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi828:
+; NoVLX-NEXT:  .Lcfi688:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22568,8 +22008,8 @@ define zeroext i64 @test_vpcmpsgtq_v2i1_
 ; NoVLX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22608,12 +22048,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi829:
+; NoVLX-NEXT:  .Lcfi689:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi830:
+; NoVLX-NEXT:  .Lcfi690:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi831:
+; NoVLX-NEXT:  .Lcfi691:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -22631,8 +22071,8 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -22693,8 +22133,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22750,8 +22190,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22792,7 +22232,6 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22810,6 +22249,7 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -22827,8 +22267,8 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22871,7 +22311,6 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -22889,6 +22328,7 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -22906,8 +22346,8 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -22969,8 +22409,8 @@ define zeroext i8 @test_vpcmpsgtq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23013,7 +22453,6 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23031,6 +22470,7 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23048,8 +22488,8 @@ define zeroext i8 @test_masked_vpcmpsgtq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -23110,8 +22550,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23166,8 +22606,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23208,7 +22648,6 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23226,6 +22665,7 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23242,8 +22682,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23286,7 +22726,6 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23304,6 +22743,7 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23320,8 +22760,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23382,8 +22822,8 @@ define zeroext i16 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23426,7 +22866,6 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23444,6 +22883,7 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -23460,8 +22900,8 @@ define zeroext i16 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -23505,12 +22945,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi832:
+; NoVLX-NEXT:  .Lcfi692:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi833:
+; NoVLX-NEXT:  .Lcfi693:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi834:
+; NoVLX-NEXT:  .Lcfi694:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23550,12 +22990,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi835:
+; NoVLX-NEXT:  .Lcfi695:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi836:
+; NoVLX-NEXT:  .Lcfi696:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi837:
+; NoVLX-NEXT:  .Lcfi697:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23597,17 +23037,16 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi838:
+; NoVLX-NEXT:  .Lcfi698:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi839:
+; NoVLX-NEXT:  .Lcfi699:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi840:
+; NoVLX-NEXT:  .Lcfi700:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23625,6 +23064,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23664,17 +23104,16 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi841:
+; NoVLX-NEXT:  .Lcfi701:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi842:
+; NoVLX-NEXT:  .Lcfi702:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi843:
+; NoVLX-NEXT:  .Lcfi703:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23692,6 +23131,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23732,12 +23172,12 @@ define zeroext i32 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi844:
+; NoVLX-NEXT:  .Lcfi704:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi845:
+; NoVLX-NEXT:  .Lcfi705:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi846:
+; NoVLX-NEXT:  .Lcfi706:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -23781,18 +23221,17 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi847:
+; NoVLX-NEXT:  .Lcfi707:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi848:
+; NoVLX-NEXT:  .Lcfi708:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi849:
+; NoVLX-NEXT:  .Lcfi709:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -23810,6 +23249,7 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -23851,12 +23291,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi850:
+; NoVLX-NEXT:  .Lcfi710:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi851:
+; NoVLX-NEXT:  .Lcfi711:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi852:
+; NoVLX-NEXT:  .Lcfi712:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23864,8 +23304,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -23902,12 +23342,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi853:
+; NoVLX-NEXT:  .Lcfi713:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi854:
+; NoVLX-NEXT:  .Lcfi714:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi855:
+; NoVLX-NEXT:  .Lcfi715:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23915,8 +23355,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -23955,12 +23395,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi856:
+; NoVLX-NEXT:  .Lcfi716:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi857:
+; NoVLX-NEXT:  .Lcfi717:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi858:
+; NoVLX-NEXT:  .Lcfi718:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -23968,7 +23408,6 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -23981,13 +23420,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24028,12 +23468,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi859:
+; NoVLX-NEXT:  .Lcfi719:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi860:
+; NoVLX-NEXT:  .Lcfi720:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi861:
+; NoVLX-NEXT:  .Lcfi721:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24041,7 +23481,6 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -24054,13 +23493,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24102,12 +23542,12 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi862:
+; NoVLX-NEXT:  .Lcfi722:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi863:
+; NoVLX-NEXT:  .Lcfi723:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi864:
+; NoVLX-NEXT:  .Lcfi724:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24116,8 +23556,8 @@ define zeroext i64 @test_vpcmpsgtq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24157,12 +23597,12 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi865:
+; NoVLX-NEXT:  .Lcfi725:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi866:
+; NoVLX-NEXT:  .Lcfi726:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi867:
+; NoVLX-NEXT:  .Lcfi727:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -24171,7 +23611,6 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -24184,13 +23623,14 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -24397,12 +23837,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi868:
+; NoVLX-NEXT:  .Lcfi728:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi869:
+; NoVLX-NEXT:  .Lcfi729:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi870:
+; NoVLX-NEXT:  .Lcfi730:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24470,12 +23910,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi871:
+; NoVLX-NEXT:  .Lcfi731:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi872:
+; NoVLX-NEXT:  .Lcfi732:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi873:
+; NoVLX-NEXT:  .Lcfi733:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24545,12 +23985,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi874:
+; NoVLX-NEXT:  .Lcfi734:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi875:
+; NoVLX-NEXT:  .Lcfi735:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi876:
+; NoVLX-NEXT:  .Lcfi736:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24622,12 +24062,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi877:
+; NoVLX-NEXT:  .Lcfi737:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi878:
+; NoVLX-NEXT:  .Lcfi738:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi879:
+; NoVLX-NEXT:  .Lcfi739:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24700,12 +24140,12 @@ define zeroext i32 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi880:
+; NoVLX-NEXT:  .Lcfi740:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi881:
+; NoVLX-NEXT:  .Lcfi741:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi882:
+; NoVLX-NEXT:  .Lcfi742:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24776,12 +24216,12 @@ define zeroext i32 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi883:
+; NoVLX-NEXT:  .Lcfi743:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi884:
+; NoVLX-NEXT:  .Lcfi744:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi885:
+; NoVLX-NEXT:  .Lcfi745:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -24855,53 +24295,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi886:
+; NoVLX-NEXT:  .Lcfi746:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi887:
+; NoVLX-NEXT:  .Lcfi747:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi888:
+; NoVLX-NEXT:  .Lcfi748:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -24933,53 +24373,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi889:
+; NoVLX-NEXT:  .Lcfi749:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi890:
+; NoVLX-NEXT:  .Lcfi750:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi891:
+; NoVLX-NEXT:  .Lcfi751:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25013,54 +24453,54 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi892:
+; NoVLX-NEXT:  .Lcfi752:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi893:
+; NoVLX-NEXT:  .Lcfi753:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi894:
+; NoVLX-NEXT:  .Lcfi754:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25095,54 +24535,54 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi895:
+; NoVLX-NEXT:  .Lcfi755:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi896:
+; NoVLX-NEXT:  .Lcfi756:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi897:
+; NoVLX-NEXT:  .Lcfi757:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25178,53 +24618,53 @@ define zeroext i64 @test_vpcmpsgtq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi898:
+; NoVLX-NEXT:  .Lcfi758:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi899:
+; NoVLX-NEXT:  .Lcfi759:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi900:
+; NoVLX-NEXT:  .Lcfi760:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtq (%rdi){1to8}, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25259,54 +24699,54 @@ define zeroext i64 @test_masked_vpcmpsgt
 ; NoVLX-LABEL: test_masked_vpcmpsgtq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi901:
+; NoVLX-NEXT:  .Lcfi761:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi902:
+; NoVLX-NEXT:  .Lcfi762:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi903:
+; NoVLX-NEXT:  .Lcfi763:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpgtq (%rsi){1to8}, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25342,30 +24782,15 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi904:
+; NoVLX-NEXT:  .Lcfi764:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi905:
+; NoVLX-NEXT:  .Lcfi765:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi906:
+; NoVLX-NEXT:  .Lcfi766:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi907:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi908:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi909:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi910:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi911:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -25376,64 +24801,64 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25442,12 +24867,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25469,30 +24889,15 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi912:
+; NoVLX-NEXT:  .Lcfi767:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi913:
+; NoVLX-NEXT:  .Lcfi768:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi914:
+; NoVLX-NEXT:  .Lcfi769:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi915:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi916:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi917:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi918:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi919:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25504,64 +24909,64 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25570,12 +24975,7 @@ define zeroext i32 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25599,30 +24999,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi920:
+; NoVLX-NEXT:  .Lcfi770:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi921:
+; NoVLX-NEXT:  .Lcfi771:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi922:
+; NoVLX-NEXT:  .Lcfi772:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi923:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi924:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi925:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi926:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi927:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
@@ -25634,64 +25019,64 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25700,12 +25085,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25730,30 +25110,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi928:
+; NoVLX-NEXT:  .Lcfi773:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi929:
+; NoVLX-NEXT:  .Lcfi774:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi930:
+; NoVLX-NEXT:  .Lcfi775:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi931:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi932:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi933:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi934:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi935:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25766,64 +25131,64 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -25832,12 +25197,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -25863,12 +25223,12 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi936:
+; NoVLX-NEXT:  .Lcfi776:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi937:
+; NoVLX-NEXT:  .Lcfi777:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi938:
+; NoVLX-NEXT:  .Lcfi778:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -25877,15 +25237,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi939:
+; NoVLX-NEXT:  .Lcfi779:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi940:
+; NoVLX-NEXT:  .Lcfi780:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi941:
+; NoVLX-NEXT:  .Lcfi781:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi942:
+; NoVLX-NEXT:  .Lcfi782:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi943:
+; NoVLX-NEXT:  .Lcfi783:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -25893,10 +25253,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -25939,11 +25295,11 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -25955,11 +25311,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -25995,12 +25355,12 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi944:
+; NoVLX-NEXT:  .Lcfi784:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi945:
+; NoVLX-NEXT:  .Lcfi785:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi946:
+; NoVLX-NEXT:  .Lcfi786:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26009,15 +25369,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi947:
+; NoVLX-NEXT:  .Lcfi787:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi948:
+; NoVLX-NEXT:  .Lcfi788:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi949:
+; NoVLX-NEXT:  .Lcfi789:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi950:
+; NoVLX-NEXT:  .Lcfi790:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi951:
+; NoVLX-NEXT:  .Lcfi791:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26026,10 +25386,6 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26072,11 +25428,11 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26088,11 +25444,15 @@ define zeroext i64 @test_vpcmpsgeb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26130,12 +25490,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi952:
+; NoVLX-NEXT:  .Lcfi792:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi953:
+; NoVLX-NEXT:  .Lcfi793:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi954:
+; NoVLX-NEXT:  .Lcfi794:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26144,15 +25504,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi955:
+; NoVLX-NEXT:  .Lcfi795:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi956:
+; NoVLX-NEXT:  .Lcfi796:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi957:
+; NoVLX-NEXT:  .Lcfi797:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi958:
+; NoVLX-NEXT:  .Lcfi798:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi959:
+; NoVLX-NEXT:  .Lcfi799:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
@@ -26161,10 +25521,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26207,11 +25563,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26223,11 +25579,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26266,12 +25626,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi960:
+; NoVLX-NEXT:  .Lcfi800:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi961:
+; NoVLX-NEXT:  .Lcfi801:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi962:
+; NoVLX-NEXT:  .Lcfi802:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -26280,15 +25640,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi963:
+; NoVLX-NEXT:  .Lcfi803:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi964:
+; NoVLX-NEXT:  .Lcfi804:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi965:
+; NoVLX-NEXT:  .Lcfi805:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi966:
+; NoVLX-NEXT:  .Lcfi806:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi967:
+; NoVLX-NEXT:  .Lcfi807:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
 ; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
@@ -26298,10 +25658,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -26344,11 +25700,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -26360,11 +25716,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -26405,12 +25765,12 @@ define zeroext i64 @test_vpcmpsgeb_v32i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi968:
+; NoVLX-NEXT:  .Lcfi808:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi969:
+; NoVLX-NEXT:  .Lcfi809:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi970:
+; NoVLX-NEXT:  .Lcfi810:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -26456,12 +25816,12 @@ define zeroext i64 @test_vpcmpsgeb_v32i1
 ; NoVLX-LABEL: test_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi971:
+; NoVLX-NEXT:  .Lcfi811:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi972:
+; NoVLX-NEXT:  .Lcfi812:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi973:
+; NoVLX-NEXT:  .Lcfi813:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -26510,12 +25870,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi974:
+; NoVLX-NEXT:  .Lcfi814:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi975:
+; NoVLX-NEXT:  .Lcfi815:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi976:
+; NoVLX-NEXT:  .Lcfi816:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -26573,12 +25933,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi977:
+; NoVLX-NEXT:  .Lcfi817:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi978:
+; NoVLX-NEXT:  .Lcfi818:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi979:
+; NoVLX-NEXT:  .Lcfi819:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -26770,12 +26130,12 @@ define zeroext i32 @test_vpcmpsgew_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi980:
+; NoVLX-NEXT:  .Lcfi820:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi981:
+; NoVLX-NEXT:  .Lcfi821:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi982:
+; NoVLX-NEXT:  .Lcfi822:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -26847,12 +26207,12 @@ define zeroext i32 @test_vpcmpsgew_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi983:
+; NoVLX-NEXT:  .Lcfi823:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi984:
+; NoVLX-NEXT:  .Lcfi824:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi985:
+; NoVLX-NEXT:  .Lcfi825:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -26927,12 +26287,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi986:
+; NoVLX-NEXT:  .Lcfi826:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi987:
+; NoVLX-NEXT:  .Lcfi827:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi988:
+; NoVLX-NEXT:  .Lcfi828:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -27008,12 +26368,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi989:
+; NoVLX-NEXT:  .Lcfi829:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi990:
+; NoVLX-NEXT:  .Lcfi830:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi991:
+; NoVLX-NEXT:  .Lcfi831:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -27091,12 +26451,12 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi992:
+; NoVLX-NEXT:  .Lcfi832:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi993:
+; NoVLX-NEXT:  .Lcfi833:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi994:
+; NoVLX-NEXT:  .Lcfi834:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27106,43 +26466,43 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27173,12 +26533,12 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi995:
+; NoVLX-NEXT:  .Lcfi835:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi996:
+; NoVLX-NEXT:  .Lcfi836:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi997:
+; NoVLX-NEXT:  .Lcfi837:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27189,43 +26549,43 @@ define zeroext i64 @test_vpcmpsgew_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27258,12 +26618,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi998:
+; NoVLX-NEXT:  .Lcfi838:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi999:
+; NoVLX-NEXT:  .Lcfi839:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1000:
+; NoVLX-NEXT:  .Lcfi840:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27274,43 +26634,43 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27344,12 +26704,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1001:
+; NoVLX-NEXT:  .Lcfi841:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1002:
+; NoVLX-NEXT:  .Lcfi842:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1003:
+; NoVLX-NEXT:  .Lcfi843:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -27361,43 +26721,43 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -27433,30 +26793,15 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1004:
+; NoVLX-NEXT:  .Lcfi844:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1005:
+; NoVLX-NEXT:  .Lcfi845:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1006:
+; NoVLX-NEXT:  .Lcfi846:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1007:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1008:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1009:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1010:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1011:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
@@ -27467,64 +26812,64 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27533,12 +26878,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27561,30 +26901,15 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1012:
+; NoVLX-NEXT:  .Lcfi847:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1013:
+; NoVLX-NEXT:  .Lcfi848:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1014:
+; NoVLX-NEXT:  .Lcfi849:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1015:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1016:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1017:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1018:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1019:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27596,64 +26921,64 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27662,12 +26987,7 @@ define zeroext i32 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27692,30 +27012,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1020:
+; NoVLX-NEXT:  .Lcfi850:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1021:
+; NoVLX-NEXT:  .Lcfi851:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1022:
+; NoVLX-NEXT:  .Lcfi852:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1023:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1024:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1025:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1026:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1027:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
@@ -27727,64 +27032,64 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27793,12 +27098,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27824,30 +27124,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1028:
+; NoVLX-NEXT:  .Lcfi853:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1029:
+; NoVLX-NEXT:  .Lcfi854:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1030:
+; NoVLX-NEXT:  .Lcfi855:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1031:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1032:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1033:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1034:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1035:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27860,64 +27145,64 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -27926,12 +27211,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -27958,12 +27238,12 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1036:
+; NoVLX-NEXT:  .Lcfi856:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1037:
+; NoVLX-NEXT:  .Lcfi857:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1038:
+; NoVLX-NEXT:  .Lcfi858:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -27972,15 +27252,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1039:
+; NoVLX-NEXT:  .Lcfi859:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1040:
+; NoVLX-NEXT:  .Lcfi860:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1041:
+; NoVLX-NEXT:  .Lcfi861:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1042:
+; NoVLX-NEXT:  .Lcfi862:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1043:
+; NoVLX-NEXT:  .Lcfi863:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -27988,10 +27268,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28034,11 +27310,11 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28050,11 +27326,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28091,12 +27371,12 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-LABEL: test_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1044:
+; NoVLX-NEXT:  .Lcfi864:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1045:
+; NoVLX-NEXT:  .Lcfi865:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1046:
+; NoVLX-NEXT:  .Lcfi866:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28105,15 +27385,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1047:
+; NoVLX-NEXT:  .Lcfi867:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1048:
+; NoVLX-NEXT:  .Lcfi868:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1049:
+; NoVLX-NEXT:  .Lcfi869:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1050:
+; NoVLX-NEXT:  .Lcfi870:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1051:
+; NoVLX-NEXT:  .Lcfi871:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28122,10 +27402,6 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28168,11 +27444,11 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28184,11 +27460,15 @@ define zeroext i64 @test_vpcmpsgew_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28227,12 +27507,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1052:
+; NoVLX-NEXT:  .Lcfi872:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1053:
+; NoVLX-NEXT:  .Lcfi873:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1054:
+; NoVLX-NEXT:  .Lcfi874:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28241,15 +27521,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1055:
+; NoVLX-NEXT:  .Lcfi875:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1056:
+; NoVLX-NEXT:  .Lcfi876:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1057:
+; NoVLX-NEXT:  .Lcfi877:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1058:
+; NoVLX-NEXT:  .Lcfi878:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1059:
+; NoVLX-NEXT:  .Lcfi879:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28258,10 +27538,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28304,11 +27580,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28320,11 +27596,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28364,12 +27644,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1060:
+; NoVLX-NEXT:  .Lcfi880:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1061:
+; NoVLX-NEXT:  .Lcfi881:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1062:
+; NoVLX-NEXT:  .Lcfi882:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -28378,15 +27658,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1063:
+; NoVLX-NEXT:  .Lcfi883:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1064:
+; NoVLX-NEXT:  .Lcfi884:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1065:
+; NoVLX-NEXT:  .Lcfi885:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1066:
+; NoVLX-NEXT:  .Lcfi886:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1067:
+; NoVLX-NEXT:  .Lcfi887:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa (%rsi), %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
@@ -28396,10 +27676,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -28442,11 +27718,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -28458,11 +27734,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -28503,62 +27783,58 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1068:
+; NoVLX-NEXT:  .Lcfi888:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1069:
+; NoVLX-NEXT:  .Lcfi889:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1070:
+; NoVLX-NEXT:  .Lcfi890:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -28566,79 +27842,82 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm5
+; NoVLX-NEXT:    vmovq %xmm5, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm5
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
 ; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm6, %xmm6
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -28646,7 +27925,7 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -28654,34 +27933,35 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm0, %ymm4, %ymm3
+; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm0
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm7, %xmm4
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
@@ -28857,69 +28137,68 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-LABEL: test_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1071:
+; NoVLX-NEXT:  .Lcfi891:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1072:
+; NoVLX-NEXT:  .Lcfi892:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1073:
+; NoVLX-NEXT:  .Lcfi893:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
-; NoVLX-NEXT:    vmovq %xmm2, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm4
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -28927,7 +28206,8 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -28937,24 +28217,24 @@ define zeroext i64 @test_vpcmpsgew_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; NoVLX-NEXT:    vmovdqa (%rdi), %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
-; NoVLX-NEXT:    vmovdqa 32(%rdi), %ymm2
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm2
+; NoVLX-NEXT:    vmovdqa (%rdi), %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT:    vmovdqa 32(%rdi), %ymm1
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm1, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
@@ -29130,12 +28410,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1074:
+; NoVLX-NEXT:  .Lcfi894:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1075:
+; NoVLX-NEXT:  .Lcfi895:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1076:
+; NoVLX-NEXT:  .Lcfi896:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -29146,17 +28426,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rdx
 ; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29164,59 +28439,61 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm9
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm4, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm6, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm4
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm6
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
@@ -29224,7 +28501,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %ecx, %eax
@@ -29234,29 +28512,30 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm5, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
+; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm7, %xmm7
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpextrq $1, %xmm5, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm7, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
@@ -29264,47 +28543,18 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    movl %eax, %ecx
-; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm5, %xmm5
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm9, %ymm4, %ymm1
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; NoVLX-NEXT:    vinserti128 $1, %xmm7, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm3
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -29316,7 +28566,31 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm2, %ymm2
+; NoVLX-NEXT:    vmovq %xmm1, %rax
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vmovd %eax, %xmm7
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm7, %xmm7
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rcx
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm0, %ymm4
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm7, %xmm0
+; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    shrl $16, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    movq %rcx, %rax
+; NoVLX-NEXT:    shrq $32, %rax
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    shrq $48, %rcx
+; NoVLX-NEXT:    vpinsrw $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; NoVLX-NEXT:    vpcmpgtw %ymm4, %ymm2, %ymm2
 ; NoVLX-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
 ; NoVLX-NEXT:    vpxor %ymm4, %ymm2, %ymm2
@@ -29383,77 +28657,83 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vinserti128 $1, %xmm8, %ymm3, %ymm3
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpxor %ymm4, %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
@@ -29496,12 +28776,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgew_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1077:
+; NoVLX-NEXT:  .Lcfi897:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1078:
+; NoVLX-NEXT:  .Lcfi898:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1079:
+; NoVLX-NEXT:  .Lcfi899:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -29513,8 +28793,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vmovd %eax, %xmm2
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
 ; NoVLX-NEXT:    shrq $32, %rdx
 ; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
@@ -29527,19 +28805,20 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
@@ -29547,6 +28826,7 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
 ; NoVLX-NEXT:    vmovq %xmm3, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
@@ -29588,160 +28868,160 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm4
 ; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
-; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; NoVLX-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; NoVLX-NEXT:    vpmovdb %zmm1, %xmm1
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; NoVLX-NEXT:    vmovdqa (%rsi), %ymm4
-; NoVLX-NEXT:    vpcmpgtw %ymm3, %ymm4, %ymm5
-; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm3
-; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT:    vpcmpeqd %ymm4, %ymm4, %ymm4
-; NoVLX-NEXT:    vpxor %ymm4, %ymm5, %ymm2
-; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
-; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
-; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
-; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm2
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm1
+; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vmovdqa (%rsi), %ymm3
+; NoVLX-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
+; NoVLX-NEXT:    vpxor %ymm3, %ymm1, %ymm1
+; NoVLX-NEXT:    vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovdqa 32(%rsi), %ymm4
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpxor %ymm4, %ymm3, %ymm3
-; NoVLX-NEXT:    vpmovsxwd %ymm3, %zmm3
-; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm3
-; NoVLX-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpcmpgtw %ymm2, %ymm4, %ymm2
+; NoVLX-NEXT:    vpxor %ymm3, %ymm2, %ymm2
+; NoVLX-NEXT:    vpmovsxwd %ymm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm3
-; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; NoVLX-NEXT:    vpmovdb %zmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpand %xmm1, %xmm3, %xmm1
-; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
-; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
-; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovdb %zmm3, %xmm3
+; NoVLX-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; NoVLX-NEXT:    vpmovsxbd %xmm2, %zmm2
+; NoVLX-NEXT:    vpslld $31, %zmm2, %zmm2
+; NoVLX-NEXT:    vptestmd %zmm2, %zmm2, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    vpand %xmm0, %xmm2, %xmm0
+; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -29798,8 +29078,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29856,8 +29136,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29896,7 +29176,6 @@ define zeroext i8 @test_masked_vpcmpsged
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -29908,13 +29187,14 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -29931,8 +29211,8 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -29974,7 +29254,6 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -29986,13 +29265,14 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30009,8 +29289,8 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30073,8 +29353,8 @@ define zeroext i8 @test_vpcmpsged_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30116,7 +29396,6 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v8i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30128,13 +29407,14 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30151,8 +29431,8 @@ define zeroext i8 @test_masked_vpcmpsged
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -30213,8 +29493,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30270,8 +29550,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30310,7 +29590,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30322,13 +29601,14 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30344,8 +29624,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30387,7 +29667,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30399,13 +29678,14 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30421,8 +29701,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30484,8 +29764,8 @@ define zeroext i16 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30527,7 +29807,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v16i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30539,13 +29818,14 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -30561,8 +29841,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -30605,12 +29885,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1080:
+; NoVLX-NEXT:  .Lcfi900:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1081:
+; NoVLX-NEXT:  .Lcfi901:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1082:
+; NoVLX-NEXT:  .Lcfi902:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30650,12 +29930,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1083:
+; NoVLX-NEXT:  .Lcfi903:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1084:
+; NoVLX-NEXT:  .Lcfi904:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1085:
+; NoVLX-NEXT:  .Lcfi905:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30698,16 +29978,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1086:
+; NoVLX-NEXT:  .Lcfi906:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1087:
+; NoVLX-NEXT:  .Lcfi907:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1088:
+; NoVLX-NEXT:  .Lcfi908:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30719,13 +29998,14 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30763,17 +30043,16 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1089:
+; NoVLX-NEXT:  .Lcfi909:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1090:
+; NoVLX-NEXT:  .Lcfi910:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1091:
+; NoVLX-NEXT:  .Lcfi911:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30785,13 +30064,14 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30831,12 +30111,12 @@ define zeroext i32 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1092:
+; NoVLX-NEXT:  .Lcfi912:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1093:
+; NoVLX-NEXT:  .Lcfi913:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1094:
+; NoVLX-NEXT:  .Lcfi914:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -30881,17 +30161,16 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1095:
+; NoVLX-NEXT:  .Lcfi915:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1096:
+; NoVLX-NEXT:  .Lcfi916:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1097:
+; NoVLX-NEXT:  .Lcfi917:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -30903,13 +30182,14 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k3, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k2, %eax
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    kmovw %k1, %eax
-; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -30949,12 +30229,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1098:
+; NoVLX-NEXT:  .Lcfi918:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1099:
+; NoVLX-NEXT:  .Lcfi919:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1100:
+; NoVLX-NEXT:  .Lcfi920:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -30963,8 +30243,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31000,12 +30280,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1101:
+; NoVLX-NEXT:  .Lcfi921:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1102:
+; NoVLX-NEXT:  .Lcfi922:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1103:
+; NoVLX-NEXT:  .Lcfi923:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31015,8 +30295,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31054,19 +30334,18 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1104:
+; NoVLX-NEXT:  .Lcfi924:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1105:
+; NoVLX-NEXT:  .Lcfi925:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1106:
+; NoVLX-NEXT:  .Lcfi926:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31079,13 +30358,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31125,12 +30405,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1107:
+; NoVLX-NEXT:  .Lcfi927:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1108:
+; NoVLX-NEXT:  .Lcfi928:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1109:
+; NoVLX-NEXT:  .Lcfi929:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31138,7 +30418,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31151,13 +30430,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31199,12 +30479,12 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-LABEL: test_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1110:
+; NoVLX-NEXT:  .Lcfi930:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1111:
+; NoVLX-NEXT:  .Lcfi931:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1112:
+; NoVLX-NEXT:  .Lcfi932:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31214,8 +30494,8 @@ define zeroext i64 @test_vpcmpsged_v4i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31255,12 +30535,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1113:
+; NoVLX-NEXT:  .Lcfi933:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1114:
+; NoVLX-NEXT:  .Lcfi934:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1115:
+; NoVLX-NEXT:  .Lcfi935:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -31268,7 +30548,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -31281,13 +30560,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -31520,12 +30800,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1116:
+; NoVLX-NEXT:  .Lcfi936:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1117:
+; NoVLX-NEXT:  .Lcfi937:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1118:
+; NoVLX-NEXT:  .Lcfi938:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31595,12 +30875,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1119:
+; NoVLX-NEXT:  .Lcfi939:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1120:
+; NoVLX-NEXT:  .Lcfi940:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1121:
+; NoVLX-NEXT:  .Lcfi941:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31672,12 +30952,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1122:
+; NoVLX-NEXT:  .Lcfi942:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1123:
+; NoVLX-NEXT:  .Lcfi943:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1124:
+; NoVLX-NEXT:  .Lcfi944:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31752,12 +31032,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1125:
+; NoVLX-NEXT:  .Lcfi945:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1126:
+; NoVLX-NEXT:  .Lcfi946:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1127:
+; NoVLX-NEXT:  .Lcfi947:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31834,12 +31114,12 @@ define zeroext i32 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1128:
+; NoVLX-NEXT:  .Lcfi948:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1129:
+; NoVLX-NEXT:  .Lcfi949:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1130:
+; NoVLX-NEXT:  .Lcfi950:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31913,12 +31193,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1131:
+; NoVLX-NEXT:  .Lcfi951:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1132:
+; NoVLX-NEXT:  .Lcfi952:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1133:
+; NoVLX-NEXT:  .Lcfi953:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -31995,55 +31275,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1134:
+; NoVLX-NEXT:  .Lcfi954:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1135:
+; NoVLX-NEXT:  .Lcfi955:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1136:
+; NoVLX-NEXT:  .Lcfi956:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32075,55 +31355,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1137:
+; NoVLX-NEXT:  .Lcfi957:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1138:
+; NoVLX-NEXT:  .Lcfi958:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1139:
+; NoVLX-NEXT:  .Lcfi959:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vmovdqa (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32157,12 +31437,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1140:
+; NoVLX-NEXT:  .Lcfi960:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1141:
+; NoVLX-NEXT:  .Lcfi961:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1142:
+; NoVLX-NEXT:  .Lcfi962:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32171,43 +31451,43 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32242,12 +31522,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1143:
+; NoVLX-NEXT:  .Lcfi963:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1144:
+; NoVLX-NEXT:  .Lcfi964:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1145:
+; NoVLX-NEXT:  .Lcfi965:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32256,43 +31536,43 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k1, %k0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32329,55 +31609,55 @@ define zeroext i64 @test_vpcmpsged_v8i1_
 ; NoVLX-LABEL: test_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1146:
+; NoVLX-NEXT:  .Lcfi966:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1147:
+; NoVLX-NEXT:  .Lcfi967:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1148:
+; NoVLX-NEXT:  .Lcfi968:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %ymm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32413,12 +31693,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1149:
+; NoVLX-NEXT:  .Lcfi969:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1150:
+; NoVLX-NEXT:  .Lcfi970:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1151:
+; NoVLX-NEXT:  .Lcfi971:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -32427,43 +31707,43 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    kandw %k0, %k1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -32500,93 +31780,78 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1152:
+; NoVLX-NEXT:  .Lcfi972:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1153:
+; NoVLX-NEXT:  .Lcfi973:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1154:
+; NoVLX-NEXT:  .Lcfi974:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1155:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1156:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1157:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1158:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1159:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32595,12 +31860,7 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32623,93 +31883,78 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1160:
+; NoVLX-NEXT:  .Lcfi975:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1161:
+; NoVLX-NEXT:  .Lcfi976:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1162:
+; NoVLX-NEXT:  .Lcfi977:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1163:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1164:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1165:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1166:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1167:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32718,12 +31963,7 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32748,94 +31988,79 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1168:
+; NoVLX-NEXT:  .Lcfi978:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1169:
+; NoVLX-NEXT:  .Lcfi979:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1170:
+; NoVLX-NEXT:  .Lcfi980:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1171:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1172:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1173:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1174:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1175:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32844,12 +32069,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -32875,94 +32095,79 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1176:
+; NoVLX-NEXT:  .Lcfi981:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1177:
+; NoVLX-NEXT:  .Lcfi982:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1178:
+; NoVLX-NEXT:  .Lcfi983:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1179:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1180:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1181:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1182:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1183:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -32971,12 +32176,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33004,94 +32204,79 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1184:
+; NoVLX-NEXT:  .Lcfi984:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1185:
+; NoVLX-NEXT:  .Lcfi985:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1186:
+; NoVLX-NEXT:  .Lcfi986:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1187:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1188:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1189:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1190:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1191:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33100,12 +32285,7 @@ define zeroext i32 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33132,30 +32312,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1192:
+; NoVLX-NEXT:  .Lcfi987:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1193:
+; NoVLX-NEXT:  .Lcfi988:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1194:
+; NoVLX-NEXT:  .Lcfi989:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1195:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1196:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1197:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1198:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1199:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
@@ -33163,64 +32328,64 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -33229,12 +32394,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -33262,12 +32422,12 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1200:
+; NoVLX-NEXT:  .Lcfi990:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1201:
+; NoVLX-NEXT:  .Lcfi991:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1202:
+; NoVLX-NEXT:  .Lcfi992:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33276,21 +32436,17 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1203:
+; NoVLX-NEXT:  .Lcfi993:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1204:
+; NoVLX-NEXT:  .Lcfi994:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1205:
+; NoVLX-NEXT:  .Lcfi995:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1206:
+; NoVLX-NEXT:  .Lcfi996:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1207:
+; NoVLX-NEXT:  .Lcfi997:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33333,11 +32489,11 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33349,11 +32505,15 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33390,12 +32550,12 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1208:
+; NoVLX-NEXT:  .Lcfi998:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1209:
+; NoVLX-NEXT:  .Lcfi999:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1210:
+; NoVLX-NEXT:  .Lcfi1000:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33404,21 +32564,17 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1211:
+; NoVLX-NEXT:  .Lcfi1001:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1212:
+; NoVLX-NEXT:  .Lcfi1002:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1213:
+; NoVLX-NEXT:  .Lcfi1003:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1214:
+; NoVLX-NEXT:  .Lcfi1004:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1215:
+; NoVLX-NEXT:  .Lcfi1005:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpcmpnltd (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33461,11 +32617,11 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33477,11 +32633,15 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33520,12 +32680,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1216:
+; NoVLX-NEXT:  .Lcfi1006:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1217:
+; NoVLX-NEXT:  .Lcfi1007:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1218:
+; NoVLX-NEXT:  .Lcfi1008:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33534,22 +32694,18 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1219:
+; NoVLX-NEXT:  .Lcfi1009:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1220:
+; NoVLX-NEXT:  .Lcfi1010:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1221:
+; NoVLX-NEXT:  .Lcfi1011:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1222:
+; NoVLX-NEXT:  .Lcfi1012:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1223:
+; NoVLX-NEXT:  .Lcfi1013:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33592,11 +32748,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33608,11 +32764,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33652,12 +32812,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1224:
+; NoVLX-NEXT:  .Lcfi1014:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1225:
+; NoVLX-NEXT:  .Lcfi1015:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1226:
+; NoVLX-NEXT:  .Lcfi1016:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33666,22 +32826,18 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1227:
+; NoVLX-NEXT:  .Lcfi1017:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1228:
+; NoVLX-NEXT:  .Lcfi1018:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1229:
+; NoVLX-NEXT:  .Lcfi1019:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1230:
+; NoVLX-NEXT:  .Lcfi1020:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1231:
+; NoVLX-NEXT:  .Lcfi1021:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltd (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33724,11 +32880,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33740,11 +32896,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33786,12 +32946,12 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-LABEL: test_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1232:
+; NoVLX-NEXT:  .Lcfi1022:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1233:
+; NoVLX-NEXT:  .Lcfi1023:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1234:
+; NoVLX-NEXT:  .Lcfi1024:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33800,22 +32960,18 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1235:
+; NoVLX-NEXT:  .Lcfi1025:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1236:
+; NoVLX-NEXT:  .Lcfi1026:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1237:
+; NoVLX-NEXT:  .Lcfi1027:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1238:
+; NoVLX-NEXT:  .Lcfi1028:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1239:
+; NoVLX-NEXT:  .Lcfi1029:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33858,11 +33014,11 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -33874,11 +33030,15 @@ define zeroext i64 @test_vpcmpsged_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -33919,12 +33079,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsged_v16i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1240:
+; NoVLX-NEXT:  .Lcfi1030:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1241:
+; NoVLX-NEXT:  .Lcfi1031:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1242:
+; NoVLX-NEXT:  .Lcfi1032:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -33933,23 +33093,19 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1243:
+; NoVLX-NEXT:  .Lcfi1033:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1244:
+; NoVLX-NEXT:  .Lcfi1034:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1245:
+; NoVLX-NEXT:  .Lcfi1035:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1246:
+; NoVLX-NEXT:  .Lcfi1036:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1247:
+; NoVLX-NEXT:  .Lcfi1037:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vpbroadcastd (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpled %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -33992,11 +33148,11 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -34008,11 +33164,15 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34114,7 +33274,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ;
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask:
 ; NoVLX:       # BB#0: # %entry
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34122,9 +33281,10 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34156,7 +33316,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34164,9 +33323,10 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34234,7 +33394,6 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v4i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34242,9 +33401,10 @@ define zeroext i4 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
 ; NoVLX-NEXT:    vpslld $31, %ymm0, %ymm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -34863,12 +34023,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1248:
+; NoVLX-NEXT:  .Lcfi1038:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1249:
+; NoVLX-NEXT:  .Lcfi1039:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1250:
+; NoVLX-NEXT:  .Lcfi1040:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -34908,12 +34068,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1251:
+; NoVLX-NEXT:  .Lcfi1041:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1252:
+; NoVLX-NEXT:  .Lcfi1042:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1253:
+; NoVLX-NEXT:  .Lcfi1043:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -34956,16 +34116,15 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1254:
+; NoVLX-NEXT:  .Lcfi1044:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1255:
+; NoVLX-NEXT:  .Lcfi1045:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1256:
+; NoVLX-NEXT:  .Lcfi1046:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -34973,9 +34132,10 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35013,17 +34173,16 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1257:
+; NoVLX-NEXT:  .Lcfi1047:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1258:
+; NoVLX-NEXT:  .Lcfi1048:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1259:
+; NoVLX-NEXT:  .Lcfi1049:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vmovdqa (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35031,9 +34190,10 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35073,12 +34233,12 @@ define zeroext i32 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1260:
+; NoVLX-NEXT:  .Lcfi1050:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1261:
+; NoVLX-NEXT:  .Lcfi1051:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1262:
+; NoVLX-NEXT:  .Lcfi1052:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -35123,17 +34283,16 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1263:
+; NoVLX-NEXT:  .Lcfi1053:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1264:
+; NoVLX-NEXT:  .Lcfi1054:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1265:
+; NoVLX-NEXT:  .Lcfi1055:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %xmm1
-; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35141,9 +34300,10 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vmovd %ecx, %xmm1
-; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovd %ecx, %xmm2
+; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpandn %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -35183,12 +34343,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1266:
+; NoVLX-NEXT:  .Lcfi1056:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1267:
+; NoVLX-NEXT:  .Lcfi1057:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1268:
+; NoVLX-NEXT:  .Lcfi1058:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35197,8 +34357,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35234,12 +34394,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1269:
+; NoVLX-NEXT:  .Lcfi1059:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1270:
+; NoVLX-NEXT:  .Lcfi1060:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1271:
+; NoVLX-NEXT:  .Lcfi1061:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35249,8 +34409,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35288,12 +34448,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1272:
+; NoVLX-NEXT:  .Lcfi1062:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1273:
+; NoVLX-NEXT:  .Lcfi1063:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1274:
+; NoVLX-NEXT:  .Lcfi1064:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35310,8 +34470,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35351,12 +34511,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1275:
+; NoVLX-NEXT:  .Lcfi1065:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1276:
+; NoVLX-NEXT:  .Lcfi1066:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1277:
+; NoVLX-NEXT:  .Lcfi1067:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35374,8 +34534,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35417,12 +34577,12 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1278:
+; NoVLX-NEXT:  .Lcfi1068:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1279:
+; NoVLX-NEXT:  .Lcfi1069:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1280:
+; NoVLX-NEXT:  .Lcfi1070:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35432,8 +34592,8 @@ define zeroext i64 @test_vpcmpsgeq_v2i1_
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35473,12 +34633,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v2i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1281:
+; NoVLX-NEXT:  .Lcfi1071:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1282:
+; NoVLX-NEXT:  .Lcfi1072:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1283:
+; NoVLX-NEXT:  .Lcfi1073:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -35496,8 +34656,8 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -35560,8 +34720,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35620,8 +34780,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35664,7 +34824,6 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35682,6 +34841,7 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35699,8 +34859,8 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35746,7 +34906,6 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35764,6 +34923,7 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35781,8 +34941,8 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35847,8 +35007,8 @@ define zeroext i8 @test_vpcmpsgeq_v4i1_v
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35894,7 +35054,6 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -35912,6 +35071,7 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -35929,8 +35089,8 @@ define zeroext i8 @test_masked_vpcmpsgeq
 ; NoVLX-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
@@ -35993,8 +35153,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36052,8 +35212,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36096,7 +35256,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36114,6 +35273,7 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36130,8 +35290,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36177,7 +35337,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36195,6 +35354,7 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36211,8 +35371,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36276,8 +35436,8 @@ define zeroext i16 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36323,7 +35483,6 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36341,6 +35500,7 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpextrb $4, %xmm0, %eax
 ; NoVLX-NEXT:    kmovw %eax, %k1
@@ -36357,8 +35517,8 @@ define zeroext i16 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpermi2d %zmm1, %zmm2, %zmm3
 ; NoVLX-NEXT:    vpslld $31, %zmm3, %zmm1
 ; NoVLX-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    vpextrb $8, %xmm0, %eax
+; NoVLX-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; NoVLX-NEXT:    kmovw %eax, %k1
 ; NoVLX-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; NoVLX-NEXT:    vmovdqa32 {{.*#+}} zmm3 = [0,1,16,3,4,5,6,7,8,9,10,11,12,13,14,15]
@@ -36402,12 +35562,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1284:
+; NoVLX-NEXT:  .Lcfi1074:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1285:
+; NoVLX-NEXT:  .Lcfi1075:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1286:
+; NoVLX-NEXT:  .Lcfi1076:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36449,12 +35609,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1287:
+; NoVLX-NEXT:  .Lcfi1077:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1288:
+; NoVLX-NEXT:  .Lcfi1078:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1289:
+; NoVLX-NEXT:  .Lcfi1079:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36499,19 +35659,18 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1290:
+; NoVLX-NEXT:  .Lcfi1080:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1291:
+; NoVLX-NEXT:  .Lcfi1081:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1292:
+; NoVLX-NEXT:  .Lcfi1082:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36529,6 +35688,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36568,12 +35728,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1293:
+; NoVLX-NEXT:  .Lcfi1083:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1294:
+; NoVLX-NEXT:  .Lcfi1084:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1295:
+; NoVLX-NEXT:  .Lcfi1085:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36581,7 +35741,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36599,6 +35758,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36640,12 +35800,12 @@ define zeroext i32 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1296:
+; NoVLX-NEXT:  .Lcfi1086:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1297:
+; NoVLX-NEXT:  .Lcfi1087:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1298:
+; NoVLX-NEXT:  .Lcfi1088:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36692,12 +35852,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1299:
+; NoVLX-NEXT:  .Lcfi1089:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1300:
+; NoVLX-NEXT:  .Lcfi1090:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1301:
+; NoVLX-NEXT:  .Lcfi1091:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -36705,7 +35865,6 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm0
 ; NoVLX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
@@ -36723,6 +35882,7 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
@@ -36764,12 +35924,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1302:
+; NoVLX-NEXT:  .Lcfi1092:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1303:
+; NoVLX-NEXT:  .Lcfi1093:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1304:
+; NoVLX-NEXT:  .Lcfi1094:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36779,8 +35939,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36817,12 +35977,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1305:
+; NoVLX-NEXT:  .Lcfi1095:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1306:
+; NoVLX-NEXT:  .Lcfi1096:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1307:
+; NoVLX-NEXT:  .Lcfi1097:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36833,8 +35993,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36873,12 +36033,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1308:
+; NoVLX-NEXT:  .Lcfi1098:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1309:
+; NoVLX-NEXT:  .Lcfi1099:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1310:
+; NoVLX-NEXT:  .Lcfi1100:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36888,7 +36048,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -36901,13 +36060,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -36948,12 +36108,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1311:
+; NoVLX-NEXT:  .Lcfi1101:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1312:
+; NoVLX-NEXT:  .Lcfi1102:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1313:
+; NoVLX-NEXT:  .Lcfi1103:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -36964,7 +36124,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -36977,13 +36136,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37026,12 +36186,12 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1314:
+; NoVLX-NEXT:  .Lcfi1104:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1315:
+; NoVLX-NEXT:  .Lcfi1105:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1316:
+; NoVLX-NEXT:  .Lcfi1106:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -37042,8 +36202,8 @@ define zeroext i64 @test_vpcmpsgeq_v4i1_
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37084,12 +36244,12 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v4i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1317:
+; NoVLX-NEXT:  .Lcfi1107:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1318:
+; NoVLX-NEXT:  .Lcfi1108:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1319:
+; NoVLX-NEXT:  .Lcfi1109:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -37100,7 +36260,6 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpmovqd %zmm0, %ymm0
 ; NoVLX-NEXT:    kmovw %edi, %k0
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
-; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k2
 ; NoVLX-NEXT:    kshiftrw $15, %k2, %k2
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k3
@@ -37113,13 +36272,14 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    kmovw %k2, %eax
 ; NoVLX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k0
+; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
 ; NoVLX-NEXT:    vpand %xmm0, %xmm1, %xmm0
 ; NoVLX-NEXT:    kxorw %k0, %k0, %k0
 ; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,4,8,12],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -37330,12 +36490,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1320:
+; NoVLX-NEXT:  .Lcfi1110:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1321:
+; NoVLX-NEXT:  .Lcfi1111:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1322:
+; NoVLX-NEXT:  .Lcfi1112:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37403,12 +36563,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1323:
+; NoVLX-NEXT:  .Lcfi1113:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1324:
+; NoVLX-NEXT:  .Lcfi1114:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1325:
+; NoVLX-NEXT:  .Lcfi1115:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37478,12 +36638,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1326:
+; NoVLX-NEXT:  .Lcfi1116:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1327:
+; NoVLX-NEXT:  .Lcfi1117:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1328:
+; NoVLX-NEXT:  .Lcfi1118:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37555,12 +36715,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1329:
+; NoVLX-NEXT:  .Lcfi1119:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1330:
+; NoVLX-NEXT:  .Lcfi1120:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1331:
+; NoVLX-NEXT:  .Lcfi1121:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37634,12 +36794,12 @@ define zeroext i32 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1332:
+; NoVLX-NEXT:  .Lcfi1122:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1333:
+; NoVLX-NEXT:  .Lcfi1123:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1334:
+; NoVLX-NEXT:  .Lcfi1124:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37712,12 +36872,12 @@ define zeroext i32 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v32i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1335:
+; NoVLX-NEXT:  .Lcfi1125:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1336:
+; NoVLX-NEXT:  .Lcfi1126:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1337:
+; NoVLX-NEXT:  .Lcfi1127:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -37792,53 +36952,53 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1338:
+; NoVLX-NEXT:  .Lcfi1128:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1339:
+; NoVLX-NEXT:  .Lcfi1129:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1340:
+; NoVLX-NEXT:  .Lcfi1130:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -37870,53 +37030,53 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1341:
+; NoVLX-NEXT:  .Lcfi1131:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1342:
+; NoVLX-NEXT:  .Lcfi1132:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1343:
+; NoVLX-NEXT:  .Lcfi1133:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpcmpnltq (%rdi), %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -37950,54 +37110,54 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1344:
+; NoVLX-NEXT:  .Lcfi1134:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1345:
+; NoVLX-NEXT:  .Lcfi1135:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1346:
+; NoVLX-NEXT:  .Lcfi1136:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38032,54 +37192,54 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1347:
+; NoVLX-NEXT:  .Lcfi1137:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1348:
+; NoVLX-NEXT:  .Lcfi1138:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1349:
+; NoVLX-NEXT:  .Lcfi1139:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpnltq (%rsi), %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38116,54 +37276,54 @@ define zeroext i64 @test_vpcmpsgeq_v8i1_
 ; NoVLX-LABEL: test_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1350:
+; NoVLX-NEXT:  .Lcfi1140:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1351:
+; NoVLX-NEXT:  .Lcfi1141:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1352:
+; NoVLX-NEXT:  .Lcfi1142:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rdi), %zmm1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38199,55 +37359,55 @@ define zeroext i64 @test_masked_vpcmpsge
 ; NoVLX-LABEL: test_masked_vpcmpsgeq_v8i1_v64i1_mask_mem_b:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1353:
+; NoVLX-NEXT:  .Lcfi1143:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1354:
+; NoVLX-NEXT:  .Lcfi1144:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1355:
+; NoVLX-NEXT:  .Lcfi1145:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vpbroadcastq (%rsi), %zmm1
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vpcmpleq %zmm0, %zmm1, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38283,30 +37443,15 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1356:
+; NoVLX-NEXT:  .Lcfi1146:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1357:
+; NoVLX-NEXT:  .Lcfi1147:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1358:
+; NoVLX-NEXT:  .Lcfi1148:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1359:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1360:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1361:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1362:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1363:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -38318,64 +37463,64 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38384,12 +37529,7 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38411,30 +37551,15 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1364:
+; NoVLX-NEXT:  .Lcfi1149:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1365:
+; NoVLX-NEXT:  .Lcfi1150:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1366:
+; NoVLX-NEXT:  .Lcfi1151:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1367:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1368:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1369:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1370:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1371:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
@@ -38446,64 +37571,64 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38512,12 +37637,7 @@ define zeroext i32 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38541,30 +37661,15 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1372:
+; NoVLX-NEXT:  .Lcfi1152:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1373:
+; NoVLX-NEXT:  .Lcfi1153:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1374:
+; NoVLX-NEXT:  .Lcfi1154:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1375:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1376:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1377:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1378:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1379:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm1, %xmm1
@@ -38577,64 +37682,64 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38643,12 +37748,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38673,30 +37773,15 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1380:
+; NoVLX-NEXT:  .Lcfi1155:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1381:
+; NoVLX-NEXT:  .Lcfi1156:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1382:
+; NoVLX-NEXT:  .Lcfi1157:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1383:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1384:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1385:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1386:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1387:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
@@ -38709,64 +37794,64 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -38775,12 +37860,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -38806,12 +37886,12 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1388:
+; NoVLX-NEXT:  .Lcfi1158:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1389:
+; NoVLX-NEXT:  .Lcfi1159:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1390:
+; NoVLX-NEXT:  .Lcfi1160:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -38820,15 +37900,15 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1391:
+; NoVLX-NEXT:  .Lcfi1161:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1392:
+; NoVLX-NEXT:  .Lcfi1162:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1393:
+; NoVLX-NEXT:  .Lcfi1163:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1394:
+; NoVLX-NEXT:  .Lcfi1164:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1395:
+; NoVLX-NEXT:  .Lcfi1165:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
@@ -38837,10 +37917,6 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -38883,11 +37959,11 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -38899,11 +37975,15 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -38939,12 +38019,12 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1396:
+; NoVLX-NEXT:  .Lcfi1166:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1397:
+; NoVLX-NEXT:  .Lcfi1167:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1398:
+; NoVLX-NEXT:  .Lcfi1168:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -38953,27 +38033,23 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1399:
+; NoVLX-NEXT:  .Lcfi1169:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1400:
+; NoVLX-NEXT:  .Lcfi1170:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1401:
+; NoVLX-NEXT:  .Lcfi1171:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1402:
+; NoVLX-NEXT:  .Lcfi1172:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1403:
+; NoVLX-NEXT:  .Lcfi1173:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39016,11 +38092,11 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39032,11 +38108,15 @@ define zeroext i64 @test_vpcmpultb_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39074,12 +38154,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1404:
+; NoVLX-NEXT:  .Lcfi1174:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1405:
+; NoVLX-NEXT:  .Lcfi1175:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1406:
+; NoVLX-NEXT:  .Lcfi1176:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -39088,15 +38168,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1407:
+; NoVLX-NEXT:  .Lcfi1177:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1408:
+; NoVLX-NEXT:  .Lcfi1178:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1409:
+; NoVLX-NEXT:  .Lcfi1179:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1410:
+; NoVLX-NEXT:  .Lcfi1180:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1411:
+; NoVLX-NEXT:  .Lcfi1181:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
 ; NoVLX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
@@ -39106,10 +38186,6 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39152,11 +38228,11 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39168,11 +38244,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39211,12 +38291,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1412:
+; NoVLX-NEXT:  .Lcfi1182:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1413:
+; NoVLX-NEXT:  .Lcfi1183:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1414:
+; NoVLX-NEXT:  .Lcfi1184:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -39225,28 +38305,24 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1415:
+; NoVLX-NEXT:  .Lcfi1185:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1416:
+; NoVLX-NEXT:  .Lcfi1186:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1417:
+; NoVLX-NEXT:  .Lcfi1187:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1418:
+; NoVLX-NEXT:  .Lcfi1188:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1419:
+; NoVLX-NEXT:  .Lcfi1189:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -39289,11 +38365,11 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -39305,11 +38381,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -39350,12 +38430,12 @@ define zeroext i64 @test_vpcmpultb_v32i1
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1420:
+; NoVLX-NEXT:  .Lcfi1190:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1421:
+; NoVLX-NEXT:  .Lcfi1191:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1422:
+; NoVLX-NEXT:  .Lcfi1192:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -39402,19 +38482,19 @@ define zeroext i64 @test_vpcmpultb_v32i1
 ; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1423:
+; NoVLX-NEXT:  .Lcfi1193:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1424:
+; NoVLX-NEXT:  .Lcfi1194:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1425:
+; NoVLX-NEXT:  .Lcfi1195:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; NoVLX-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; NoVLX-NEXT:    vpslld $31, %zmm1, %zmm1
@@ -39456,12 +38536,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1426:
+; NoVLX-NEXT:  .Lcfi1196:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1427:
+; NoVLX-NEXT:  .Lcfi1197:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1428:
+; NoVLX-NEXT:  .Lcfi1198:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -39520,12 +38600,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1429:
+; NoVLX-NEXT:  .Lcfi1199:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1430:
+; NoVLX-NEXT:  .Lcfi1200:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1431:
+; NoVLX-NEXT:  .Lcfi1201:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $96, %rsp
@@ -39719,12 +38799,12 @@ define zeroext i32 @test_vpcmpultw_v8i1_
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1432:
+; NoVLX-NEXT:  .Lcfi1202:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1433:
+; NoVLX-NEXT:  .Lcfi1203:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1434:
+; NoVLX-NEXT:  .Lcfi1204:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39797,12 +38877,12 @@ define zeroext i32 @test_vpcmpultw_v8i1_
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1435:
+; NoVLX-NEXT:  .Lcfi1205:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1436:
+; NoVLX-NEXT:  .Lcfi1206:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1437:
+; NoVLX-NEXT:  .Lcfi1207:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39877,12 +38957,12 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1438:
+; NoVLX-NEXT:  .Lcfi1208:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1439:
+; NoVLX-NEXT:  .Lcfi1209:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1440:
+; NoVLX-NEXT:  .Lcfi1210:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -39959,12 +39039,12 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1441:
+; NoVLX-NEXT:  .Lcfi1211:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1442:
+; NoVLX-NEXT:  .Lcfi1212:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1443:
+; NoVLX-NEXT:  .Lcfi1213:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
@@ -40042,12 +39122,12 @@ define zeroext i64 @test_vpcmpultw_v8i1_
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1444:
+; NoVLX-NEXT:  .Lcfi1214:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1445:
+; NoVLX-NEXT:  .Lcfi1215:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1446:
+; NoVLX-NEXT:  .Lcfi1216:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -40058,43 +39138,43 @@ define zeroext i64 @test_vpcmpultw_v8i1_
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40125,59 +39205,59 @@ define zeroext i64 @test_vpcmpultw_v8i1_
 ; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1447:
+; NoVLX-NEXT:  .Lcfi1217:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1448:
+; NoVLX-NEXT:  .Lcfi1218:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1449:
+; NoVLX-NEXT:  .Lcfi1219:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40210,12 +39290,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1450:
+; NoVLX-NEXT:  .Lcfi1220:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1451:
+; NoVLX-NEXT:  .Lcfi1221:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1452:
+; NoVLX-NEXT:  .Lcfi1222:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
@@ -40227,43 +39307,43 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40297,60 +39377,60 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1453:
+; NoVLX-NEXT:  .Lcfi1223:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1454:
+; NoVLX-NEXT:  .Lcfi1224:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1455:
+; NoVLX-NEXT:  .Lcfi1225:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm2
 ; NoVLX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT:    vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT:    vpcmpgtw %xmm0, %xmm2, %xmm0
 ; NoVLX-NEXT:    vpmovsxwq %xmm0, %zmm0
 ; NoVLX-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmq %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $0, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $1, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $2, %edx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k0, %eax
-; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %ecx
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -40386,30 +39466,15 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1456:
+; NoVLX-NEXT:  .Lcfi1226:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1457:
+; NoVLX-NEXT:  .Lcfi1227:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1458:
+; NoVLX-NEXT:  .Lcfi1228:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1459:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1460:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1461:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1462:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1463:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
@@ -40421,64 +39486,64 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40487,12 +39552,7 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40515,30 +39575,15 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1464:
+; NoVLX-NEXT:  .Lcfi1229:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1465:
+; NoVLX-NEXT:  .Lcfi1230:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1466:
+; NoVLX-NEXT:  .Lcfi1231:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1467:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1468:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1469:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1470:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1471:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
@@ -40550,64 +39595,64 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40616,12 +39661,7 @@ define zeroext i32 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40646,30 +39686,15 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1472:
+; NoVLX-NEXT:  .Lcfi1232:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1473:
+; NoVLX-NEXT:  .Lcfi1233:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1474:
+; NoVLX-NEXT:  .Lcfi1234:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1475:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1476:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1477:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1478:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1479:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm1, %ymm1
@@ -40682,64 +39707,64 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40748,12 +39773,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40779,30 +39799,15 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1480:
+; NoVLX-NEXT:  .Lcfi1235:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1481:
+; NoVLX-NEXT:  .Lcfi1236:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1482:
+; NoVLX-NEXT:  .Lcfi1237:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
-; NoVLX-NEXT:    pushq %r15
-; NoVLX-NEXT:    pushq %r14
-; NoVLX-NEXT:    pushq %r13
-; NoVLX-NEXT:    pushq %r12
-; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $32, %rsp
-; NoVLX-NEXT:  .Lcfi1483:
-; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1484:
-; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1485:
-; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1486:
-; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1487:
-; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
@@ -40815,64 +39820,64 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %eax
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
-; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k1, %ecx
-; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -40881,12 +39886,7 @@ define zeroext i32 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; NoVLX-NEXT:    kmovw %k0, (%rsp)
 ; NoVLX-NEXT:    movl (%rsp), %eax
-; NoVLX-NEXT:    leaq -40(%rbp), %rsp
-; NoVLX-NEXT:    popq %rbx
-; NoVLX-NEXT:    popq %r12
-; NoVLX-NEXT:    popq %r13
-; NoVLX-NEXT:    popq %r14
-; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    movq %rbp, %rsp
 ; NoVLX-NEXT:    popq %rbp
 ; NoVLX-NEXT:    retq
 entry:
@@ -40913,12 +39913,12 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1488:
+; NoVLX-NEXT:  .Lcfi1238:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1489:
+; NoVLX-NEXT:  .Lcfi1239:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1490:
+; NoVLX-NEXT:  .Lcfi1240:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -40927,15 +39927,15 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1491:
+; NoVLX-NEXT:  .Lcfi1241:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1492:
+; NoVLX-NEXT:  .Lcfi1242:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1493:
+; NoVLX-NEXT:  .Lcfi1243:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1494:
+; NoVLX-NEXT:  .Lcfi1244:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1495:
+; NoVLX-NEXT:  .Lcfi1245:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -40944,10 +39944,6 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -40990,11 +39986,11 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41006,11 +40002,15 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41047,12 +40047,12 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1496:
+; NoVLX-NEXT:  .Lcfi1246:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1497:
+; NoVLX-NEXT:  .Lcfi1247:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1498:
+; NoVLX-NEXT:  .Lcfi1248:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41061,27 +40061,23 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1499:
+; NoVLX-NEXT:  .Lcfi1249:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1500:
+; NoVLX-NEXT:  .Lcfi1250:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1501:
+; NoVLX-NEXT:  .Lcfi1251:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1502:
+; NoVLX-NEXT:  .Lcfi1252:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1503:
+; NoVLX-NEXT:  .Lcfi1253:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41124,11 +40120,11 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41140,11 +40136,15 @@ define zeroext i64 @test_vpcmpultw_v16i1
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41183,12 +40183,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1504:
+; NoVLX-NEXT:  .Lcfi1254:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1505:
+; NoVLX-NEXT:  .Lcfi1255:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1506:
+; NoVLX-NEXT:  .Lcfi1256:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41197,15 +40197,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1507:
+; NoVLX-NEXT:  .Lcfi1257:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1508:
+; NoVLX-NEXT:  .Lcfi1258:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1509:
+; NoVLX-NEXT:  .Lcfi1259:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1510:
+; NoVLX-NEXT:  .Lcfi1260:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1511:
+; NoVLX-NEXT:  .Lcfi1261:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
 ; NoVLX-NEXT:    vpxor %ymm2, %ymm0, %ymm0
@@ -41215,10 +40215,6 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41261,11 +40257,11 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41277,11 +40273,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41321,12 +40321,12 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1512:
+; NoVLX-NEXT:  .Lcfi1262:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1513:
+; NoVLX-NEXT:  .Lcfi1263:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1514:
+; NoVLX-NEXT:  .Lcfi1264:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    pushq %r15
 ; NoVLX-NEXT:    pushq %r14
@@ -41335,28 +40335,24 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    pushq %rbx
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:  .Lcfi1515:
+; NoVLX-NEXT:  .Lcfi1265:
 ; NoVLX-NEXT:    .cfi_offset %rbx, -56
-; NoVLX-NEXT:  .Lcfi1516:
+; NoVLX-NEXT:  .Lcfi1266:
 ; NoVLX-NEXT:    .cfi_offset %r12, -48
-; NoVLX-NEXT:  .Lcfi1517:
+; NoVLX-NEXT:  .Lcfi1267:
 ; NoVLX-NEXT:    .cfi_offset %r13, -40
-; NoVLX-NEXT:  .Lcfi1518:
+; NoVLX-NEXT:  .Lcfi1268:
 ; NoVLX-NEXT:    .cfi_offset %r14, -32
-; NoVLX-NEXT:  .Lcfi1519:
+; NoVLX-NEXT:  .Lcfi1269:
 ; NoVLX-NEXT:    .cfi_offset %r15, -24
 ; NoVLX-NEXT:    vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
+; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm2
 ; NoVLX-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT:    vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm0
 ; NoVLX-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    kmovw %edi, %k1
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0 {%k1}
-; NoVLX-NEXT:    kxorw %k0, %k0, %k1
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
-; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    kmovw %k1, %r8d
@@ -41399,11 +40395,11 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vmovd %r10d, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kmovw %k1, %ecx
 ; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
 ; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
 ; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
-; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    kmovw %k1, %r8d
 ; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
@@ -41415,11 +40411,15 @@ define zeroext i64 @test_masked_vpcmpult
 ; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
-; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    kxorw %k0, %k0, %k0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    vpinsrb $14, %r8d, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
 ; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -41460,62 +40460,58 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-LABEL: test_vpcmpultw_v32i1_v64i1_mask:
 ; NoVLX:       # BB#0: # %entry
 ; NoVLX-NEXT:    pushq %rbp
-; NoVLX-NEXT:  .Lcfi1520:
+; NoVLX-NEXT:  .Lcfi1270:
 ; NoVLX-NEXT:    .cfi_def_cfa_offset 16
-; NoVLX-NEXT:  .Lcfi1521:
+; NoVLX-NEXT:  .Lcfi1271:
 ; NoVLX-NEXT:    .cfi_offset %rbp, -16
 ; NoVLX-NEXT:    movq %rsp, %rbp
-; NoVLX-NEXT:  .Lcfi1522:
+; NoVLX-NEXT:  .Lcfi1272:
 ; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
 ; NoVLX-NEXT:    andq $-32, %rsp
 ; NoVLX-NEXT:    subq $64, %rsp
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm3
-; NoVLX-NEXT:    vmovq %xmm3, %rax
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
+; NoVLX-NEXT:    vmovq %xmm2, %rax
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    movq %rax, %rdx
-; NoVLX-NEXT:    vmovd %eax, %xmm2
+; NoVLX-NEXT:    vmovd %eax, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm5
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm8
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm4
-; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
-; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm7
-; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrq $32, %rdx
-; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm3, %rax
+; NoVLX-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm8
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm5
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vmovd %ecx, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT:    vmovq %xmm2, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm0, %xmm4
+; NoVLX-NEXT:    vmovq %xmm4, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm3, %xmm9
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm5
@@ -41523,79 +40519,72 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
-; NoVLX-NEXT:    vpextrq $1, %xmm2, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm4
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm4, %xmm4
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm7, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm4, %xmm4
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm0, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm5
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm4, %xmm4
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm5
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm5, %xmm5
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm5, %xmm5
+; NoVLX-NEXT:    vpextrq $1, %xmm0, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm5, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vextracti32x4 $1, %zmm1, %xmm6
 ; NoVLX-NEXT:    vmovq %xmm6, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm7
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm5
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vmovd %ecx, %xmm0
+; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
 ; NoVLX-NEXT:    vpextrq $1, %xmm6, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm0, %xmm0
 ; NoVLX-NEXT:    movq %rax, %rcx
-; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm1, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm6
-; NoVLX-NEXT:    movl %ecx, %eax
-; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm2
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    movq %rcx, %rax
-; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
-; NoVLX-NEXT:    shrq $48, %rcx
-; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm1
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vmovq %xmm1, %rax
 ; NoVLX-NEXT:    movl %eax, %ecx
 ; NoVLX-NEXT:    shrl $16, %ecx
-; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT:    vmovd %eax, %xmm6
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm6, %xmm6
 ; NoVLX-NEXT:    movq %rax, %rcx
 ; NoVLX-NEXT:    shrq $32, %rcx
-; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm1, %xmm1
-; NoVLX-NEXT:    vmovq %xmm4, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm6, %xmm6
+; NoVLX-NEXT:    vextracti32x4 $3, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rcx
 ; NoVLX-NEXT:    shrq $48, %rax
-; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm6, %xmm6
 ; NoVLX-NEXT:    movl %ecx, %eax
 ; NoVLX-NEXT:    shrl $16, %eax
 ; NoVLX-NEXT:    vmovd %ecx, %xmm2
@@ -41603,7 +40592,7 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
 ; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm2, %xmm2
-; NoVLX-NEXT:    vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rax
 ; NoVLX-NEXT:    shrq $48, %rcx
 ; NoVLX-NEXT:    vpinsrw $3, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movl %eax, %ecx
@@ -41611,111 +40600,122 @@ define zeroext i64 @test_vpcmpultw_v32i1
 ; NoVLX-NEXT:    vpinsrw $4, %eax, %xmm2, %xmm2
 ; NoVLX-NEXT:    vpinsrw $5, %ecx, %xmm2, %xmm2
 ; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    shrq $32, %rcx
 ; NoVLX-NEXT:    vpinsrw $6, %ecx, %xmm2, %xmm2
-; NoVLX-NEXT:    vmovq %xmm8, %rcx
-; NoVLX-NEXT:    shrq $48, %rax
 ; NoVLX-NEXT:    vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT:    vextracti32x4 $2, %zmm1, %xmm7
+; NoVLX-NEXT:    vmovq %xmm7, %rax
+; NoVLX-NEXT:    movl %eax, %ecx
+; NoVLX-NEXT:    shrl $16, %ecx
+; NoVLX-NEXT:    vmovd %eax, %xmm3
+; NoVLX-NEXT:    vpinsrw $1, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    movq %rax, %rcx
+; NoVLX-NEXT:    shrq $32, %rcx
+; NoVLX-NEXT:    vpinsrw $2, %ecx, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm7, %rcx
+; NoVLX-NEXT:    shrq $48, %rax
+; NoVLX-NEXT:    vpinsrw $3, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movl %ecx, %eax
+; NoVLX-NEXT:    vpinsrw $4, %ecx, %xmm3, %xmm3
 ; NoVLX-NEXT:    shrl $16, %eax
-; NoVLX-NEXT:    vmovd %ecx, %xmm4
-; NoVLX-NEXT:    vpinsrw $1, %eax, %xmm4, %xmm4
+; NoVLX-NEXT:    vpinsrw $5, %eax, %xmm3, %xmm3
 ; NoVLX-NEXT:    movq %rcx, %rax
 ; NoVLX-NEXT:    shrq $32, %rax
-; NoVLX-NEXT:    vpinsrw $2, %eax, %xmm4, %xmm4
-; NoVLX-NEXT:    vpextrq $1, %xmm8, %rax
-; NoVLX-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT:    vinserti128 $1, %xmm5, %ymm7, %ymm3
-; NoVLX-NEXT:    vinserti128 $1, %xmm6, %ymm1, %ymm1
+; NoVLX-NEXT:    vpinsrw $6, %eax, %xmm3, %xmm3
+; NoVLX-NEXT:    vpextrq $1, %xmm1, %rax
+; NoVLX-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm4

[... 8716 lines stripped ...]