[llvm] r314306 - [X86][SKX][KNL] Updated regression tests to use -mattr instead of -mcpu flag. NFC.

Gadi Haber via llvm-commits llvm-commits at lists.llvm.org
Wed Sep 27 07:44:15 PDT 2017


Author: gadi.haber
Date: Wed Sep 27 07:44:15 2017
New Revision: 314306

URL: http://llvm.org/viewvc/llvm-project?rev=314306&view=rev
Log:
[X86][SKX][KNL] Updated regression tests to use -mattr instead of -mcpu flag. NFC.

NFC.
 Updated 8 regression tests to use -mattr instead of the -mcpu flag, as follows:
 -mcpu=knl --> -mattr=+avx512f
 -mcpu=skx --> -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq

These updates are part of the preparation for a large upcoming commit that adds full instruction scheduling information for the SKX target.
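
For illustration, a typical RUN line change has the following shape (abbreviated sketch; the full RUN lines with all FileCheck prefixes appear in the per-file diffs below):

  Before: ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
  After:  ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=KNL

Only the target selection changes, from a CPU model to an explicit feature list. The check prefixes stay the same, and the autogenerated assertions are regenerated to match; in avx512-cvt.ll this lets several per-prefix blocks (KNL, VLNODQ, AVX512BW) collapse into shared NODQ/NOVL blocks, as shown below.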

Reviewers: delena, zvi, RKSimon
Differential Revision: https://reviews.llvm.org/D38222

Change-Id: I2381c9b5bb75ecacfca017243c22d054f6eddd14

Modified:
    llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
    llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
    llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
    llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
    llvm/trunk/test/CodeGen/X86/sse42-intrinsics-x86.ll

Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
@@ -143,46 +143,29 @@ define <2 x float> @sltof2f32(<2 x i64>
 }
 
 define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
-; KNL-LABEL: slto4f32_mem:
-; KNL:       # BB#0:
-; KNL-NEXT:    vmovdqu (%rdi), %ymm0
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT:    retq
+; NODQ-LABEL: slto4f32_mem:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vmovdqu (%rdi), %ymm0
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: slto4f32_mem:
 ; VLDQ:       # BB#0:
 ; VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
 ; VLDQ-NEXT:    retq
 ;
-; VLNODQ-LABEL: slto4f32_mem:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vmovdqu (%rdi), %ymm0
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT:    vzeroupper
-; VLNODQ-NEXT:    retq
-;
 ; AVX512DQ-LABEL: slto4f32_mem:
 ; AVX512DQ:       # BB#0:
 ; AVX512DQ-NEXT:    vmovups (%rdi), %ymm0
@@ -190,24 +173,6 @@ define <4 x float> @slto4f32_mem(<4 x i6
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: slto4f32_mem:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %a1 = load <4 x i64>, <4 x i64>* %a, align 8
   %b = sitofp <4 x i64> %a1 to <4 x float>
   ret <4 x float>%b
@@ -282,21 +247,22 @@ define <4 x i64> @f32to4sl(<4 x float> %
 }
 
 define <4 x float> @slto4f32(<4 x i64> %a) {
-; KNL-LABEL: slto4f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT:    retq
+; NODQ-LABEL: slto4f32:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: slto4f32:
 ; VLDQ:       # BB#0:
@@ -304,23 +270,6 @@ define <4 x float> @slto4f32(<4 x i64> %
 ; VLDQ-NEXT:    vzeroupper
 ; VLDQ-NEXT:    retq
 ;
-; VLNODQ-LABEL: slto4f32:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT:    vzeroupper
-; VLNODQ-NEXT:    retq
-;
 ; AVX512DQ-LABEL: slto4f32:
 ; AVX512DQ:       # BB#0:
 ; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
@@ -328,43 +277,27 @@ define <4 x float> @slto4f32(<4 x i64> %
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: slto4f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %b = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
 
 define <4 x float> @ulto4f32(<4 x i64> %a) {
-; KNL-LABEL: ulto4f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT:    retq
+; NODQ-LABEL: ulto4f32:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT:    vzeroupper
+; NODQ-NEXT:    retq
 ;
 ; VLDQ-LABEL: ulto4f32:
 ; VLDQ:       # BB#0:
@@ -372,23 +305,6 @@ define <4 x float> @ulto4f32(<4 x i64> %
 ; VLDQ-NEXT:    vzeroupper
 ; VLDQ-NEXT:    retq
 ;
-; VLNODQ-LABEL: ulto4f32:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT:    vzeroupper
-; VLNODQ-NEXT:    retq
-;
 ; AVX512DQ-LABEL: ulto4f32:
 ; AVX512DQ:       # BB#0:
 ; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
@@ -396,23 +312,6 @@ define <4 x float> @ulto4f32(<4 x i64> %
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: ulto4f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %b = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -457,179 +356,67 @@ define <8 x double> @ulto8f64(<8 x i64>
 }
 
 define <16 x double> @ulto16f64(<16 x i64> %a) {
-; KNL-LABEL: ulto16f64:
-; KNL:       # BB#0:
-; KNL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT:    vmovq %xmm2, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; KNL-NEXT:    vpextrq $1, %xmm4, %rax
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovq %xmm4, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; KNL-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm1, %rax
-; KNL-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; KNL-NEXT:    retq
+; NODQ-LABEL: ulto16f64:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT:    retq
 ;
 ; DQ-LABEL: ulto16f64:
 ; DQ:       # BB#0:
 ; DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
 ; DQ-NEXT:    vcvtuqq2pd %zmm1, %zmm1
 ; DQ-NEXT:    retq
-;
-; VLNODQ-LABEL: ulto16f64:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; VLNODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm1, %rax
-; VLNODQ-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; VLNODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; VLNODQ-NEXT:    retq
-;
-; AVX512BW-LABEL: ulto16f64:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm0
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm2
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm1, %rax
-; AVX512BW-NEXT:    vcvtusi2sdq %rax, %xmm5, %xmm1
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    retq
   %b = uitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
 }
@@ -653,18 +440,12 @@ define <16 x i32> @f32to16ui(<16 x float
 }
 
 define <16 x i8> @f32to16uc(<16 x float> %f) {
-; KNL-LABEL: f32to16uc:
-; KNL:       # BB#0:
-; KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    retq
-;
-; AVX512-LABEL: f32to16uc:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; ALL-LABEL: f32to16uc:
+; ALL:       # BB#0:
+; ALL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %res = fptoui <16 x float> %f to <16 x i8>
   ret <16 x i8> %res
 }
@@ -696,33 +477,18 @@ define <8 x i32> @f32to8ui(<8 x float> %
 }
 
 define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
-; KNL-LABEL: f32to4ui:
-; KNL:       # BB#0:
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT:    retq
+; NOVL-LABEL: f32to4ui:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f32to4ui:
 ; VL:       # BB#0:
 ; VL-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: f32to4ui:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: f32to4ui:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT:    vcvttps2udq %zmm0, %zmm0
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %b = fptoui <4 x float> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -737,12 +503,13 @@ define <8 x i32> @f64to8ui(<8 x double>
 }
 
 define <8 x i16> @f64to8us(<8 x double> %f) {
-; KNL-LABEL: f64to8us:
-; KNL:       # BB#0:
-; KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT:    retq
+; NOVL-LABEL: f64to8us:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f64to8us:
 ; VL:       # BB#0:
@@ -750,33 +517,18 @@ define <8 x i16> @f64to8us(<8 x double>
 ; VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: f64to8us:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: f64to8us:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %res = fptoui <8 x double> %f to <8 x i16>
   ret <8 x i16> %res
 }
 
 define <8 x i8> @f64to8uc(<8 x double> %f) {
-; KNL-LABEL: f64to8uc:
-; KNL:       # BB#0:
-; KNL-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT:    retq
+; NOVL-LABEL: f64to8uc:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT:    vpmovdw %zmm0, %ymm0
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f64to8uc:
 ; VL:       # BB#0:
@@ -784,55 +536,24 @@ define <8 x i8> @f64to8uc(<8 x double> %
 ; VL-NEXT:    vpmovdw %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: f64to8uc:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: f64to8uc:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vcvttpd2dq %zmm0, %ymm0
-; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %res = fptoui <8 x double> %f to <8 x i8>
   ret <8 x i8> %res
 }
 
 define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
-; KNL-LABEL: f64to4ui:
-; KNL:       # BB#0:
-; KNL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT:    retq
+; NOVL-LABEL: f64to4ui:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f64to4ui:
 ; VL:       # BB#0:
 ; VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: f64to4ui:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: f64to4ui:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %b = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -926,16 +647,11 @@ define <8 x i32> @f64to8si(<8 x double>
 }
 
 define <4 x i32> @f64to4si(<4 x double> %a) {
-; KNL-LABEL: f64to4si:
-; KNL:       # BB#0:
-; KNL-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; KNL-NEXT:    retq
-;
-; AVX512-LABEL: f64to4si:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; ALL-LABEL: f64to4si:
+; ALL:       # BB#0:
+; ALL-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -952,28 +668,24 @@ define <16 x float> @f64to16f32(<16 x do
 }
 
 define <4 x float> @f64to4f32(<4 x double> %b) {
-; KNL-LABEL: f64to4f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT:    retq
-;
-; AVX512-LABEL: f64to4f32:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; AVX512-NEXT:    vzeroupper
-; AVX512-NEXT:    retq
+; ALL-LABEL: f64to4f32:
+; ALL:       # BB#0:
+; ALL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   ret <4 x float> %a
 }
 
 define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
-; KNL-LABEL: f64to4f32_mask:
-; KNL:       # BB#0:
-; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
-; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; KNL-NEXT:    retq
+; NOVL-LABEL: f64to4f32_mask:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    vpslld $31, %xmm1, %xmm1
+; NOVL-NEXT:    vpsrad $31, %xmm1, %xmm1
+; NOVL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; NOVL-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: f64to4f32_mask:
 ; VL:       # BB#0:
@@ -982,24 +694,6 @@ define <4 x float> @f64to4f32_mask(<4 x
 ; VL-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
 ; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: f64to4f32_mask:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; AVX512DQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: f64to4f32_mask:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm1
-; AVX512BW-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; AVX512BW-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
   ret <4 x float> %c
@@ -1180,100 +874,40 @@ define <16 x double> @uito16f64(<16 x i3
 }
 
 define <8 x float> @slto8f32(<8 x i64> %a) {
-; KNL-LABEL: slto8f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT:    vmovq %xmm1, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vmovq %xmm2, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
+; NODQ-LABEL: slto8f32:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
 ;
 ; DQ-LABEL: slto8f32:
 ; DQ:       # BB#0:
 ; DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; DQ-NEXT:    retq
-;
-; VLNODQ-LABEL: slto8f32:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; VLNODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm1, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VLNODQ-NEXT:    retq
-;
-; AVX512BW-LABEL: slto8f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm1, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm1
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
   %b = sitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1386,278 +1020,106 @@ define <8 x double> @slto8f64(<8 x i64>
 }
 
 define <16 x double> @slto16f64(<16 x i64> %a) {
-; KNL-LABEL: slto16f64:
-; KNL:       # BB#0:
-; KNL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT:    vmovq %xmm2, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vextracti32x4 $2, %zmm0, %xmm4
-; KNL-NEXT:    vpextrq $1, %xmm4, %rax
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovq %xmm4, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; KNL-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm3, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT:    vmovq %xmm1, %rax
-; KNL-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
-; KNL-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; KNL-NEXT:    retq
+; NODQ-LABEL: slto16f64:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT:    vpextrq $1, %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm3, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT:    retq
 ;
 ; DQ-LABEL: slto16f64:
 ; DQ:       # BB#0:
 ; DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; DQ-NEXT:    vcvtqq2pd %zmm1, %zmm1
 ; DQ-NEXT:    retq
-;
-; VLNODQ-LABEL: slto16f64:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; VLNODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; VLNODQ-NEXT:    vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm3, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT:    vmovq %xmm1, %rax
-; VLNODQ-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
-; VLNODQ-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; VLNODQ-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; VLNODQ-NEXT:    retq
-;
-; AVX512BW-LABEL: slto16f64:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm4, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm0
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT:    vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm2
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT:    vextracti128 $1, %ymm1, %xmm3
-; AVX512BW-NEXT:    vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm3, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT:    vmovq %xmm1, %rax
-; AVX512BW-NEXT:    vcvtsi2sdq %rax, %xmm5, %xmm1
-; AVX512BW-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512BW-NEXT:    vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT:    retq
   %b = sitofp <16 x i64> %a to <16 x double>
   ret <16 x double> %b
 }
 
 define <8 x float> @ulto8f32(<8 x i64> %a) {
-; KNL-LABEL: ulto8f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT:    vpextrq $1, %xmm1, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT:    vmovq %xmm1, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vmovq %xmm2, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm2, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; KNL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
+; NODQ-LABEL: ulto8f32:
+; NODQ:       # BB#0:
+; NODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT:    vpextrq $1, %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT:    vmovq %xmm1, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT:    vmovq %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT:    vpextrq $1, %xmm2, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT:    vmovq %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT:    retq
 ;
 ; DQ-LABEL: ulto8f32:
 ; DQ:       # BB#0:
 ; DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; DQ-NEXT:    retq
-;
-; VLNODQ-LABEL: ulto8f32:
-; VLNODQ:       # BB#0:
-; VLNODQ-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; VLNODQ-NEXT:    vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm1, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; VLNODQ-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT:    vmovq %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
-; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VLNODQ-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VLNODQ-NEXT:    retq
-;
-; AVX512BW-LABEL: ulto8f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm1, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm1
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vmovq %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm4, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512BW-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT:    retq
   %b = uitofp <8 x i64> %a to <8 x float>
   ret <8 x float> %b
 }
@@ -1852,33 +1314,18 @@ define <8 x float> @uito8f32(<8 x i32> %
 }
 
 define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
-; KNL-LABEL: uito4f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT:    retq
+; NOVL-LABEL: uito4f32:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT:    vzeroupper
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: uito4f32:
 ; VL:       # BB#0:
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: uito4f32:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: uito4f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
   %b = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -2510,19 +1957,19 @@ define <4 x double> @ubto4f64(<4 x i32>
 }
 
 define <2 x float> @ubto2f32(<2 x i32> %a) {
-; KNL-LABEL: ubto2f32:
-; KNL:       # BB#0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT:    vpextrb $8, %xmm0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
-; KNL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
-; KNL-NEXT:    andl $1, %ecx
-; KNL-NEXT:    vcvtsi2ssl %ecx, %xmm2, %xmm1
-; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; KNL-NEXT:    retq
+; NOVL-LABEL: ubto2f32:
+; NOVL:       # BB#0:
+; NOVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT:    vpextrb $8, %xmm0, %eax
+; NOVL-NEXT:    andl $1, %eax
+; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
+; NOVL-NEXT:    vpextrb $0, %xmm0, %eax
+; NOVL-NEXT:    andl $1, %eax
+; NOVL-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; NOVL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NOVL-NEXT:    retq
 ;
 ; VL-LABEL: ubto2f32:
 ; VL:       # BB#0:
@@ -2532,34 +1979,6 @@ define <2 x float> @ubto2f32(<2 x i32> %
 ; VL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
-;
-; AVX512DQ-LABEL: ubto2f32:
-; AVX512DQ:       # BB#0:
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512DQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
-; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT:    andl $1, %eax
-; AVX512DQ-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
-; AVX512DQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX512DQ-NEXT:    retq
-;
-; AVX512BW-LABEL: ubto2f32:
-; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm1
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT:    andl $1, %eax
-; AVX512BW-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
-; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX512BW-NEXT:    retq
   %mask = icmp ult <2 x i32> %a, zeroinitializer
   %1 = uitofp <2 x i1> %mask to <2 x float>
   ret <2 x float> %1

Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Wed Sep 27 07:44:15 2017
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
 
 define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
 ; CHECK-LABEL: test1:
@@ -12,7 +12,6 @@ define <16 x float> @test1(<16 x float>
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %rrr = load float, float* %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
   %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -28,7 +27,6 @@ define <8 x double> @test2(<8 x double>
 ; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
 ; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %rrr = load double, double* %br
   %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
   %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -42,7 +40,6 @@ define <16 x float> @test3(<16 x float>
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
 ; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %eee = extractelement <16 x float> %x, i32 4
   %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
   ret <16 x float> %rrr2
@@ -56,7 +53,6 @@ define <8 x i64> @test4(<8 x i64> %x) no
 ; CHECK-NEXT:    vpinsrq $1, %rax, %xmm0, %xmm1
 ; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %eee = extractelement <8 x i64> %x, i32 4
   %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
   ret <8 x i64> %rrr2
@@ -67,7 +63,6 @@ define i32 @test5(<4 x float> %x) nounwi
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vextractps $3, %xmm0, %eax
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %ef = extractelement <4 x float> %x, i32 3
   %ei = bitcast float %ef to i32
   ret i32 %ei
@@ -78,148 +73,83 @@ define void @test6(<4 x float> %x, float
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vextractps $3, %xmm0, (%rdi)
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %ef = extractelement <4 x float> %x, i32 3
   store float %ef, float* %out, align 4
   ret void
 }
 
 define float @test7(<16 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test7:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:    andq $-64, %rsp
-; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT:    vmovaps %zmm0, (%rsp)
-; KNL-NEXT:    andl $15, %edi
-; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT:    movq %rbp, %rsp
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test7:
-; SKX:       ## BB#0:
-; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:    andq $-64, %rsp
-; SKX-NEXT:    subq $128, %rsp
-; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT:    vmovaps %zmm0, (%rsp)
-; SKX-NEXT:    andl $15, %edi
-; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT:    movq %rbp, %rsp
-; SKX-NEXT:    popq %rbp
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test7:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    andq $-64, %rsp
+; CHECK-NEXT:    subq $128, %rsp
+; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
+; CHECK-NEXT:    andl $15, %edi
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %e = extractelement <16 x float> %x, i32 %ind
   ret float %e
 }
 
 define double @test8(<8 x double> %x, i32 %ind) nounwind {
-; KNL-LABEL: test8:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:    andq $-64, %rsp
-; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT:    vmovaps %zmm0, (%rsp)
-; KNL-NEXT:    andl $7, %edi
-; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT:    movq %rbp, %rsp
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test8:
-; SKX:       ## BB#0:
-; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:    andq $-64, %rsp
-; SKX-NEXT:    subq $128, %rsp
-; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT:    vmovaps %zmm0, (%rsp)
-; SKX-NEXT:    andl $7, %edi
-; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT:    movq %rbp, %rsp
-; SKX-NEXT:    popq %rbp
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    andq $-64, %rsp
+; CHECK-NEXT:    subq $128, %rsp
+; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %e = extractelement <8 x double> %x, i32 %ind
   ret double %e
 }
 
 define float @test9(<8 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test9:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:    andq $-32, %rsp
-; KNL-NEXT:    subq $64, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT:    vmovaps %ymm0, (%rsp)
-; KNL-NEXT:    andl $7, %edi
-; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT:    movq %rbp, %rsp
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test9:
-; SKX:       ## BB#0:
-; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:    andq $-32, %rsp
-; SKX-NEXT:    subq $64, %rsp
-; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT:    vmovaps %ymm0, (%rsp)
-; SKX-NEXT:    andl $7, %edi
-; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT:    movq %rbp, %rsp
-; SKX-NEXT:    popq %rbp
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test9:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    andq $-32, %rsp
+; CHECK-NEXT:    subq $64, %rsp
+; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
+; CHECK-NEXT:    andl $7, %edi
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %e = extractelement <8 x float> %x, i32 %ind
   ret float %e
 }
 
 define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
-; KNL-LABEL: test10:
-; KNL:       ## BB#0:
-; KNL-NEXT:    pushq %rbp
-; KNL-NEXT:    movq %rsp, %rbp
-; KNL-NEXT:    andq $-64, %rsp
-; KNL-NEXT:    subq $128, %rsp
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT:    vmovaps %zmm0, (%rsp)
-; KNL-NEXT:    andl $15, %edi
-; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
-; KNL-NEXT:    movq %rbp, %rsp
-; KNL-NEXT:    popq %rbp
-; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
-;
-; SKX-LABEL: test10:
-; SKX:       ## BB#0:
-; SKX-NEXT:    pushq %rbp
-; SKX-NEXT:    movq %rsp, %rbp
-; SKX-NEXT:    andq $-64, %rsp
-; SKX-NEXT:    subq $128, %rsp
-; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT:    vmovaps %zmm0, (%rsp)
-; SKX-NEXT:    andl $15, %edi
-; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
-; SKX-NEXT:    movq %rbp, %rsp
-; SKX-NEXT:    popq %rbp
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
+; CHECK-LABEL: test10:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    andq $-64, %rsp
+; CHECK-NEXT:    subq $128, %rsp
+; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
+; CHECK-NEXT:    andl $15, %edi
+; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %e = extractelement <16 x i32> %x, i32 %ind
   ret i32 %e
 }
@@ -274,6 +204,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
 ; KNL-NEXT:    testb $1, %al
 ; KNL-NEXT:    cmoveq %rsi, %rdi
 ; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test12:
@@ -339,6 +270,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64
 ; KNL-NEXT:    testb $1, %al
 ; KNL-NEXT:    cmoveq %rsi, %rdi
 ; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test14:
@@ -387,6 +319,7 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test16:
@@ -424,6 +357,7 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test17:
@@ -448,20 +382,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
 }
 
 define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v8i64:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v8i64:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrq $1, %xmm0, %rax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v8i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <8 x i64> %x, i32 1
   %r2 = extractelement <8 x i64> %x, i32 3
   store i64 %r2, i64* %dst, align 1
@@ -469,20 +396,13 @@ define i64 @extract_v8i64(<8 x i64> %x,
 }
 
 define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v4i64:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrq $1, %xmm0, %rax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v4i64:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrq $1, %xmm0, %rax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v4i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <4 x i64> %x, i32 1
   %r2 = extractelement <4 x i64> %x, i32 3
   store i64 %r2, i64* %dst, align 1
@@ -502,20 +422,13 @@ define i64 @extract_v2i64(<2 x i64> %x,
 }
 
 define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v16i32:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrd $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v16i32:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrd $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v16i32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrd $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrd $1, %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <16 x i32> %x, i32 1
   %r2 = extractelement <16 x i32> %x, i32 5
   store i32 %r2, i32* %dst, align 1
@@ -523,20 +436,13 @@ define i32 @extract_v16i32(<16 x i32> %x
 }
 
 define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v8i32:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrd $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v8i32:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrd $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v8i32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrd $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrd $1, %xmm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <8 x i32> %x, i32 1
   %r2 = extractelement <8 x i32> %x, i32 5
   store i32 %r2, i32* %dst, align 1
@@ -556,22 +462,14 @@ define i32 @extract_v4i32(<4 x i32> %x,
 }
 
 define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v32i16:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrw $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v32i16:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrw $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v32i16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <32 x i16> %x, i32 1
   %r2 = extractelement <32 x i16> %x, i32 9
   store i16 %r2, i16* %dst, align 1
@@ -579,22 +477,14 @@ define i16 @extract_v32i16(<32 x i16> %x
 }
 
 define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v16i16:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrw $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v16i16:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrw $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v16i16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <16 x i16> %x, i32 1
   %r2 = extractelement <16 x i16> %x, i32 9
   store i16 %r2, i16* %dst, align 1
@@ -615,22 +505,14 @@ define i16 @extract_v8i16(<8 x i16> %x,
 }
 
 define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v64i8:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrb $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v64i8:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrb $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v64i8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <64 x i8> %x, i32 1
   %r2 = extractelement <64 x i8> %x, i32 17
   store i8 %r2, i8* %dst, align 1
@@ -638,22 +520,14 @@ define i8 @extract_v64i8(<64 x i8> %x, i
 }
 
 define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v32i8:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpextrb $1, %xmm0, %eax
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT:    vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: extract_v32i8:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpextrb $1, %xmm0, %eax
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
+; CHECK-LABEL: extract_v32i8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
+; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
   %r1 = extractelement <32 x i8> %x, i32 1
   %r2 = extractelement <32 x i8> %x, i32 17
   store i8 %r2, i8* %dst, align 1
@@ -936,6 +810,7 @@ define i32 @test_insertelement_v32i1(i32
 ; KNL-NEXT:    subq $32, %rsp
 ; KNL-NEXT:    xorl %eax, %eax
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
@@ -1061,7 +936,6 @@ define i32 @test_insertelement_v32i1(i32
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    setb %al
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
@@ -1083,6 +957,7 @@ define i32 @test_insertelement_v32i1(i32
 ; KNL-NEXT:    movl (%rsp), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_insertelement_v32i1:
@@ -1112,23 +987,23 @@ define i8 @test_iinsertelement_v4i1(i32
 ; KNL-LABEL: test_iinsertelement_v4i1:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT:    vpextrb $4, %xmm0, %eax
-; KNL-NEXT:    setb %cl
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpextrb $4, %xmm0, %ecx
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    vpextrb $0, %xmm0, %eax
-; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpextrb $0, %xmm0, %ecx
+; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
 ; KNL-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
 ; KNL-NEXT:    vpsllq $63, %zmm3, %zmm1
 ; KNL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT:    kmovw %ecx, %k1
+; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
 ; KNL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
 ; KNL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm3
@@ -1144,6 +1019,7 @@ define i8 @test_iinsertelement_v4i1(i32
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v4i1:
@@ -1188,6 +1064,7 @@ define i8 @test_iinsertelement_v2i1(i32
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v2i1:
@@ -1308,6 +1185,7 @@ define zeroext i8 @test_extractelement_v
 ; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpextrb $2, %xmm0, %eax
 ; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_v32i1:
@@ -1338,6 +1216,7 @@ define zeroext i8 @test_extractelement_v
 ; KNL-NEXT:    movb $4, %cl
 ; KNL-NEXT:    subb %al, %cl
 ; KNL-NEXT:    movzbl %cl, %eax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_v64i1:
@@ -1370,6 +1249,7 @@ define zeroext i8 @extractelement_v64i1_
 ; KNL-NEXT:    movb $4, %cl
 ; KNL-NEXT:    subb %al, %cl
 ; KNL-NEXT:    movzbl %cl, %eax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: extractelement_v64i1_alt:
@@ -1421,6 +1301,7 @@ define i64 @test_extractelement_variable
 ; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v4i64:
@@ -1466,6 +1347,7 @@ define i64 @test_extractelement_variable
 ; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8i64:
@@ -1523,6 +1405,7 @@ define double @test_extractelement_varia
 ; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v4f64:
@@ -1568,6 +1451,7 @@ define double @test_extractelement_varia
 ; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8f64:
@@ -1625,6 +1509,7 @@ define i32 @test_extractelement_variable
 ; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8i32:
@@ -1670,6 +1555,7 @@ define i32 @test_extractelement_variable
 ; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v16i32:
@@ -1727,6 +1613,7 @@ define float @test_extractelement_variab
 ; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v8f32:
@@ -1772,6 +1659,7 @@ define float @test_extractelement_variab
 ; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v16f32:
@@ -1829,6 +1717,7 @@ define i16 @test_extractelement_variable
 ; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v16i16:
@@ -1875,6 +1764,7 @@ define i16 @test_extractelement_variable
 ; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v32i16:
@@ -1902,23 +1792,14 @@ define i16 @test_extractelement_variable
 }
 
 define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i8:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT:    andl $15, %edi
-; KNL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; KNL-NEXT:    movb (%rdi,%rax), %al
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i8:
-; SKX:       ## BB#0:
-; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT:    andl $15, %edi
-; SKX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
-; SKX-NEXT:    movb (%rdi,%rax), %al
-; SKX-NEXT:    retq
+; CHECK-LABEL: test_extractelement_variable_v16i8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    andl $15, %edi
+; CHECK-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movb (%rdi,%rax), %al
+; CHECK-NEXT:    retq
   %t2 = extractelement <16 x i8> %t1, i32 %index
   ret i8 %t2
 }
@@ -1936,13 +1817,14 @@ define i8 @test_extractelement_variable_
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $64, %rsp
-; KNL-NEXT:    vmovaps %ymm0, (%rsp)
 ; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
 ; KNL-NEXT:    andl $31, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v32i8:
@@ -1984,14 +1866,15 @@ define i8 @test_extractelement_variable_
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-64, %rsp
 ; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vmovaps %ymm0, (%rsp)
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $63, %edi
 ; KNL-NEXT:    movq %rsp, %rax
 ; KNL-NEXT:    movb (%rdi,%rax), %al
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v64i8:
@@ -2042,6 +1925,7 @@ define i8 @test_extractelement_variable_
 ; KNL-NEXT:    movb (%rax,%rcx), %al
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
@@ -2075,12 +1959,12 @@ define i8 @test_extractelement_variable_
 define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v2i1:
 ; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $1, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,8), %eax
 ; KNL-NEXT:    andl $1, %eax
@@ -2105,12 +1989,12 @@ define zeroext i8 @test_extractelement_v
 define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
 ; KNL-LABEL: test_extractelement_varible_v4i1:
 ; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
 ; KNL-NEXT:    andl $3, %edi
 ; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
 ; KNL-NEXT:    andl $1, %eax
@@ -2156,6 +2040,7 @@ define zeroext i8 @test_extractelement_v
 ; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_varible_v8i1:
@@ -2209,6 +2094,7 @@ define zeroext i8 @test_extractelement_v
 ; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_varible_v16i1:
@@ -2265,6 +2151,7 @@ define zeroext i8 @test_extractelement_v
 ; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_extractelement_varible_v32i1:

Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
 ; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
 ; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
 
@@ -287,7 +287,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: shuf_test1:
 ; SKX:       ## BB#0:
@@ -296,7 +295,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 ;
 ; AVX512BW-LABEL: shuf_test1:
 ; AVX512BW:       ## BB#0:
@@ -305,7 +303,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:    ## -- End function
 ;
 ; AVX512DQ-LABEL: shuf_test1:
 ; AVX512DQ:       ## BB#0:
@@ -314,7 +311,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
 ; AVX512DQ-NEXT:    kmovw %k0, %eax
 ; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:    ## -- End function
    %v1 = bitcast i16 %v to <16 x i1>
    %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    %mask1 = bitcast <8 x i1> %mask to i8
@@ -329,6 +325,7 @@ define i32 @zext_test1(<16 x i32> %a, <1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: zext_test1:
@@ -375,6 +372,7 @@ define i16 @zext_test2(<16 x i32> %a, <1
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andl $1, %eax
 ; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: zext_test2:
@@ -424,6 +422,7 @@ define i8 @zext_test3(<16 x i32> %a, <16
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    andb $1, %al
 ; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: zext_test3:
@@ -516,6 +515,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4
 ; KNL-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
 ; KNL-NEXT:    vpmovqd %zmm1, %ymm1
 ; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test4:
@@ -611,6 +611,7 @@ define void @test7(<8 x i1> %mask)  {
 ; KNL-NEXT:    korw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test7:
@@ -661,8 +662,8 @@ false:
 define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; KNL-LABEL: test8:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; KNL-NEXT:    jg LBB17_1
 ; KNL-NEXT:  ## BB#2:
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm1, %k1
@@ -672,12 +673,13 @@ define <16 x i8> @test8(<16 x i32>%a, <1
 ; KNL-NEXT:  LBB17_3:
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    cmpl %esi, %edi
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; SKX-NEXT:    jg LBB17_1
 ; SKX-NEXT:  ## BB#2:
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
@@ -743,6 +745,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test9:
@@ -883,6 +886,7 @@ define <16 x i1> @test15(i32 %x, i32 %y)
 ; KNL-NEXT:    kmovw %ecx, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test15:
@@ -1189,6 +1193,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; KNL-NEXT:    korw %k1, %k0, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovqw %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test18:
@@ -1289,7 +1294,6 @@ define <32 x i16> @test21(<32 x i16> %x
 ; KNL-NEXT:    vpsraw $15, %ymm2, %ymm2
 ; KNL-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test21:
 ; SKX:       ## BB#0:
@@ -1297,7 +1301,6 @@ define <32 x i16> @test21(<32 x i16> %x
 ; SKX-NEXT:    vpmovb2m %ymm1, %k1
 ; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 ;
 ; AVX512BW-LABEL: test21:
 ; AVX512BW:       ## BB#0:
@@ -1305,7 +1308,6 @@ define <32 x i16> @test21(<32 x i16> %x
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    retq
-; AVX512BW-NEXT:    ## -- End function
 ;
 ; AVX512DQ-LABEL: test21:
 ; AVX512DQ:       ## BB#0:
@@ -1319,7 +1321,6 @@ define <32 x i16> @test21(<32 x i16> %x
 ; AVX512DQ-NEXT:    vpsraw $15, %ymm2, %ymm2
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX512DQ-NEXT:    retq
-; AVX512DQ-NEXT:    ## -- End function
   %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
 }
@@ -1332,6 +1333,7 @@ define void @test22(<4 x i1> %a, <4 x i1
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test22:
@@ -1371,6 +1373,7 @@ define void @test23(<2 x i1> %a, <2 x i1
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test23:
@@ -1450,6 +1453,7 @@ define void @store_v2i1(<2 x i1> %c , <2
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_v2i1:
@@ -1494,6 +1498,7 @@ define void @store_v4i1(<4 x i1> %c , <4
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_v4i1:
@@ -1538,6 +1543,7 @@ define void @store_v8i1(<8 x i1> %c , <8
 ; KNL-NEXT:    knotw %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_v8i1:
@@ -1580,6 +1586,7 @@ define void @store_v16i1(<16 x i1> %c ,
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    knotw %k0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_v16i1:
@@ -1738,9 +1745,11 @@ define void @ktest_1(<8 x double> %in, d
 ; KNL-NEXT:    je LBB41_2
 ; KNL-NEXT:  ## BB#1: ## %L1
 ; KNL-NEXT:    vmovapd %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ; KNL-NEXT:  LBB41_2: ## %L2
 ; KNL-NEXT:    vmovapd %zmm0, 8(%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: ktest_1:
@@ -1835,73 +1844,9 @@ define void @ktest_2(<32 x float> %in, f
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $32, %rsp
-; KNL-NEXT:    vmovups 64(%rdi), %zmm2
-; KNL-NEXT:    vcmpltps %zmm1, %zmm2, %k2
-; KNL-NEXT:    kshiftlw $14, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    kshiftlw $15, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm2
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $13, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $12, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $11, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $10, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $9, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $8, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $7, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $6, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $5, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $4, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $3, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $2, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftlw $1, %k2, %k0
-; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kshiftrw $15, %k2, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vmovups (%rdi), %zmm3
-; KNL-NEXT:    vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT:    vmovups (%rdi), %zmm2
+; KNL-NEXT:    vmovups 64(%rdi), %zmm3
+; KNL-NEXT:    vcmpltps %zmm1, %zmm3, %k1
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
@@ -1965,138 +1910,202 @@ define void @ktest_2(<32 x float> %in, f
 ; KNL-NEXT:    kshiftrw $15, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k2} {z}
+; KNL-NEXT:    vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT:    kshiftlw $14, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kshiftlw $15, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vmovd %ecx, %xmm2
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $13, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $12, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $11, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $10, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $9, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $8, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $7, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $6, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $5, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $4, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $3, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $2, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftlw $1, %k2, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; KNL-NEXT:    kshiftrw $15, %k2, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT:    vmovups 4(%rdi), %zmm3 {%k2} {z}
+; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k1} {z}
 ; KNL-NEXT:    vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT:    kshiftlw $14, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
-; KNL-NEXT:    kshiftlw $15, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %ecx
+; KNL-NEXT:    kshiftlw $14, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kshiftlw $15, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    vmovd %ecx, %xmm4
 ; KNL-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $13, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $13, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $12, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $12, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $11, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $11, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $10, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $10, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $9, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $9, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $8, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $8, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $7, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $7, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $6, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $6, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $5, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $5, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $4, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $4, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $3, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $3, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $2, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $2, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT:    kshiftlw $1, %k0, %k2
-; KNL-NEXT:    kshiftrw $15, %k2, %k2
-; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    kshiftlw $1, %k0, %k1
+; KNL-NEXT:    kshiftrw $15, %k1, %k1
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT:    vmovups 4(%rdi), %zmm5 {%k1} {z}
-; KNL-NEXT:    vcmpltps %zmm5, %zmm0, %k0
+; KNL-NEXT:    vcmpltps %zmm3, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm5
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vmovd %ecx, %xmm3
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm5, %xmm5
-; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT:    vinserti128 $1, %xmm4, %ymm5, %ymm3
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
 ; KNL-NEXT:    vpor %ymm3, %ymm2, %ymm2
 ; KNL-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; KNL-NEXT:    vpmovsxbd %xmm3, %zmm3
@@ -2119,6 +2128,7 @@ define void @ktest_2(<32 x float> %in, f
 ; KNL-NEXT:  LBB42_3: ## %End
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: ktest_2:
@@ -2565,6 +2575,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a)
 ; KNL-NEXT:    kmovw %eax, %k1
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: load_2i1:
@@ -2602,6 +2613,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a)
 ; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: load_4i1:
@@ -2730,6 +2742,7 @@ define void @store_8i1(<8 x i1>* %a, <8
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_8i1:
@@ -2768,6 +2781,7 @@ define void @store_8i1_1(<8 x i1>* %a, <
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_8i1_1:
@@ -2806,6 +2820,7 @@ define void @store_16i1(<16 x i1>* %a, <
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_16i1:
@@ -2847,6 +2862,7 @@ define void @store_32i1(<32 x i1>* %a, <
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_32i1:
@@ -2897,6 +2913,7 @@ define void @store_32i1_1(<32 x i1>* %a,
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_32i1_1:
@@ -2941,6 +2958,36 @@ define void @store_64i1(<64 x i1>* %a, <
 ;
 ; KNL-LABEL: store_64i1:
 ; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi9:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:    pushq %r15
+; KNL-NEXT:  Lcfi10:
+; KNL-NEXT:    .cfi_def_cfa_offset 24
+; KNL-NEXT:    pushq %r14
+; KNL-NEXT:  Lcfi11:
+; KNL-NEXT:    .cfi_def_cfa_offset 32
+; KNL-NEXT:    pushq %r13
+; KNL-NEXT:  Lcfi12:
+; KNL-NEXT:    .cfi_def_cfa_offset 40
+; KNL-NEXT:    pushq %r12
+; KNL-NEXT:  Lcfi13:
+; KNL-NEXT:    .cfi_def_cfa_offset 48
+; KNL-NEXT:    pushq %rbx
+; KNL-NEXT:  Lcfi14:
+; KNL-NEXT:    .cfi_def_cfa_offset 56
+; KNL-NEXT:  Lcfi15:
+; KNL-NEXT:    .cfi_offset %rbx, -56
+; KNL-NEXT:  Lcfi16:
+; KNL-NEXT:    .cfi_offset %r12, -48
+; KNL-NEXT:  Lcfi17:
+; KNL-NEXT:    .cfi_offset %r13, -40
+; KNL-NEXT:  Lcfi18:
+; KNL-NEXT:    .cfi_offset %r14, -32
+; KNL-NEXT:  Lcfi19:
+; KNL-NEXT:    .cfi_offset %r15, -24
+; KNL-NEXT:  Lcfi20:
+; KNL-NEXT:    .cfi_offset %rbp, -16
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -2952,66 +2999,66 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    vptestmd %zmm3, %zmm3, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %ecx, %xmm3
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
-; KNL-NEXT:    vpinsrb $2, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r12d
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
-; KNL-NEXT:    vpinsrb $5, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
-; KNL-NEXT:    vpinsrb $11, %edx, %xmm3, %xmm3
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vmovd %r9d, %xmm3
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm3, %xmm2
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
-; KNL-NEXT:    vpinsrb $14, %edx, %xmm2, %xmm2
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    vpinsrb $1, %r8d, %xmm3, %xmm2
+; KNL-NEXT:    vpinsrb $2, %r10d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $4, %r14d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $5, %r15d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $6, %r12d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $7, %r13d, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $8, %ebx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $9, %ebp, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $12, %edx, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
+; KNL-NEXT:    vpinsrb $14, %r9d, %xmm2, %xmm2
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
 ; KNL-NEXT:    vpmovsxbd %xmm2, %zmm2
@@ -3020,66 +3067,66 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, 6(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    kshiftlw $15, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    kshiftlw $13, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %r9d
 ; KNL-NEXT:    kshiftlw $12, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %ecx, %xmm2
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r11d
 ; KNL-NEXT:    kshiftlw $11, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r14d
 ; KNL-NEXT:    kshiftlw $10, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $9, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r12d
 ; KNL-NEXT:    kshiftlw $8, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    kshiftlw $5, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %ebp
 ; KNL-NEXT:    kshiftlw $4, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ebx
 ; KNL-NEXT:    kshiftlw $3, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k2, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %edx, %xmm2, %xmm2
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vmovd %r10d, %xmm2
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL-NEXT:    kshiftrw $15, %k2, %k0
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm2, %xmm1
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vpinsrb $14, %edx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $1, %r8d, %xmm2, %xmm1
+; KNL-NEXT:    vpinsrb $2, %r9d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $3, %r11d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $4, %r14d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
@@ -3088,139 +3135,146 @@ define void @store_64i1(<64 x i1>* %a, <
 ; KNL-NEXT:    kmovw %k0, 4(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r8d
 ; KNL-NEXT:    kshiftlw $15, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    kshiftlw $13, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %r9d
 ; KNL-NEXT:    kshiftlw $12, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vmovd %ecx, %xmm1
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r11d
 ; KNL-NEXT:    kshiftlw $11, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r14d
 ; KNL-NEXT:    kshiftlw $10, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %r15d
 ; KNL-NEXT:    kshiftlw $9, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %r12d
 ; KNL-NEXT:    kshiftlw $8, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $5, %edx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ecx
 ; KNL-NEXT:    kshiftlw $6, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %esi
 ; KNL-NEXT:    kshiftlw $5, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %ebp
 ; KNL-NEXT:    kshiftlw $4, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $8, %edx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    kmovw %k0, %ebx
 ; KNL-NEXT:    kshiftlw $3, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    kshiftlw $2, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    kshiftlw $1, %k1, %k0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpinsrb $11, %edx, %xmm1, %xmm1
-; KNL-NEXT:    kmovw %k0, %edx
+; KNL-NEXT:    vmovd %r10d, %xmm1
+; KNL-NEXT:    kmovw %k0, %r10d
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm0
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm0
+; KNL-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT:    kmovw %k1, 2(%rdi)
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r8d
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $14, %edx, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r9d
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r10d
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    vmovd %edx, %xmm1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r11d
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r14d
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
-; KNL-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %r15d
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
-; KNL-NEXT:    vpinsrb $3, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %r12d
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
-; KNL-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %edx
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
-; KNL-NEXT:    vpinsrb $6, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %esi
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %ebp
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
-; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    kmovw %k1, %ebx
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
-; KNL-NEXT:    vpinsrb $9, %edx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
-; KNL-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vpinsrb $12, %edx, %xmm1, %xmm1
+; KNL-NEXT:    vmovd %r9d, %xmm0
+; KNL-NEXT:    kmovw %k1, %r9d
+; KNL-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $2, %r10d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT:    vpinsrb $14, %r9d, %xmm0, %xmm0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm0
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k0, 2(%rdi)
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, (%rdi)
+; KNL-NEXT:    popq %rbx
+; KNL-NEXT:    popq %r12
+; KNL-NEXT:    popq %r13
+; KNL-NEXT:    popq %r14
+; KNL-NEXT:    popq %r15
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: store_64i1:
@@ -3571,6 +3625,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    addl %eax, %eax
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_bitcast_v8i1_zext:
@@ -3609,40 +3664,14 @@ define i32 @test_bitcast_v8i1_zext(<16 x
 }
 
 define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; KNL-LABEL: test_bitcast_v16i1_zext:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    addl %eax, %eax
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: test_bitcast_v16i1_zext:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
-; SKX-NEXT:    addl %eax, %eax
-; SKX-NEXT:    vzeroupper
-; SKX-NEXT:    retq
-;
-; AVX512BW-LABEL: test_bitcast_v16i1_zext:
-; AVX512BW:       ## BB#0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512BW-NEXT:    kmovw %k0, %eax
-; AVX512BW-NEXT:    addl %eax, %eax
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_bitcast_v16i1_zext:
-; AVX512DQ:       ## BB#0:
-; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT:    kmovw %k0, %eax
-; AVX512DQ-NEXT:    addl %eax, %eax
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
+; CHECK-LABEL: test_bitcast_v16i1_zext:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
    %v1 = icmp eq <16 x i32> %a, zeroinitializer
    %mask1 = bitcast <16 x i1> %v1 to i16
    %val = zext i16 %mask1 to i32

Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
 
 define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
 ; CHECK-LABEL: test1:
@@ -8,7 +8,6 @@ define <16 x float> @test1(<16 x float>
 ; CHECK-NEXT:    vcmpleps %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = fcmp ole <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
   ret <16 x float> %max
@@ -20,7 +19,6 @@ define <8 x double> @test2(<8 x double>
 ; CHECK-NEXT:    vcmplepd %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = fcmp ole <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
   ret <8 x double> %max
@@ -32,7 +30,6 @@ define <16 x i32> @test3(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpeqd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %yp, align 4
   %mask = icmp eq <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -45,7 +42,6 @@ define <16 x i32> @test4_unsigned(<16 x
 ; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = icmp uge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
@@ -57,7 +53,6 @@ define <8 x i64> @test5(<8 x i64> %x, <8
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = icmp eq <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
   ret <8 x i64> %max
@@ -69,7 +64,6 @@ define <8 x i64> @test6_unsigned(<8 x i6
 ; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = icmp ugt <8 x i64> %x, %y
   %max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
   ret <8 x i64> %max
@@ -123,14 +117,12 @@ define <8 x i32> @test9(<8 x i32> %x, <8
 ; KNL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpcmpeqd %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
   %mask = icmp eq <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
@@ -145,14 +137,12 @@ define <8 x float> @test10(<8 x float> %
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %mask = fcmp oeq <8 x float> %x, %y
   %max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -164,7 +154,6 @@ define <8 x i32> @test11_unsigned(<8 x i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = icmp ugt <8 x i32> %x, %y
   %max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
   ret <8 x i32> %max
@@ -178,8 +167,8 @@ define i16 @test12(<16 x i64> %a, <16 x
 ; KNL-NEXT:    kunpckbw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12:
 ; SKX:       ## BB#0:
@@ -190,7 +179,6 @@ define i16 @test12(<16 x i64> %a, <16 x
 ; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
@@ -269,6 +257,8 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
+; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
@@ -325,13 +315,11 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
@@ -342,8 +330,8 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; KNL-NEXT:    movl (%rsp), %eax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12_v32i32:
 ; SKX:       ## BB#0:
@@ -353,7 +341,6 @@ define i32 @test12_v32i32(<32 x i32> %a,
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
   %res = icmp eq <32 x i32> %a, %b
   %res1 = bitcast <32 x i1> %res to i32
   ret i32 %res1
@@ -577,75 +564,75 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
-; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm1
-; KNL-NEXT:    vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
-; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpcmpeqw %ymm6, %ymm2, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm1
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vmovd %ecx, %xmm0
+; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
 ; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
-; KNL-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -656,8 +643,8 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; KNL-NEXT:    orq %rcx, %rax
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test12_v64i16:
 ; SKX:       ## BB#0:
@@ -667,7 +654,6 @@ define i64 @test12_v64i16(<64 x i16> %a,
 ; SKX-NEXT:    kmovq %k0, %rax
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
   %res = icmp eq <64 x i16> %a, %b
   %res1 = bitcast <64 x i1> %res to i64
   ret i64 %res1
@@ -721,7 +707,6 @@ define <16 x i32> @test16(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled %zmm0, %zmm1, %k1
 ; CHECK-NEXT:    vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask = icmp sge <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
   ret <16 x i32> %max
@@ -733,7 +718,6 @@ define <16 x i32> @test17(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpgtd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp sgt <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -746,7 +730,6 @@ define <16 x i32> @test18(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp sle <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -759,7 +742,6 @@ define <16 x i32> @test19(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask = icmp ule <16 x i32> %x, %y
   %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -773,7 +755,6 @@ define <16 x i32> @test20(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp eq <16 x i32> %x1, %y1
   %mask0 = icmp eq <16 x i32> %x, %y
   %mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -788,7 +769,6 @@ define <8 x i64> @test21(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpleq %zmm2, %zmm3, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <8 x i64> %x1, %y1
   %mask0 = icmp sle <8 x i64> %x, %y
   %mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -803,7 +783,6 @@ define <8 x i64> @test22(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sgt <8 x i64> %x1, %y1
   %y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
   %mask0 = icmp sgt <8 x i64> %x, %y
@@ -819,7 +798,6 @@ define <16 x i32> @test23(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpleud (%rdi), %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <16 x i32> %x1, %y1
   %y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
   %mask0 = icmp ule <16 x i32> %x, %y
@@ -834,7 +812,6 @@ define <8 x i64> @test24(<8 x i64> %x, <
 ; CHECK-NEXT:    vpcmpeqq (%rdi){1to8}, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %yb = load i64, i64* %yb.ptr, align 4
   %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
   %y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -849,7 +826,6 @@ define <16 x i32> @test25(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpled (%rdi){1to16}, %zmm0, %k1
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %yb = load i32, i32* %yb.ptr, align 4
   %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
   %y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -865,7 +841,6 @@ define <16 x i32> @test26(<16 x i32> %x,
 ; CHECK-NEXT:    vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <16 x i32> %x1, %y1
   %yb = load i32, i32* %yb.ptr, align 4
   %y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -883,7 +858,6 @@ define <8 x i64> @test27(<8 x i64> %x, i
 ; CHECK-NEXT:    vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; CHECK-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %mask1 = icmp sge <8 x i64> %x1, %y1
   %yb = load i64, i64* %yb.ptr, align 4
   %y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -926,6 +900,7 @@ define <16 x i8>@test29(<16 x i32> %x, <
 ; KNL-NEXT:    kxorw %k1, %k0, %k1
 ; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test29:
@@ -949,14 +924,12 @@ define <4 x double> @test30(<4 x double>
 ; KNL-NEXT:    vcmpeqpd %ymm1, %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test30:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %mask = fcmp oeq <4 x double> %x, %y
   %max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -969,14 +942,12 @@ define <2 x double> @test31(<2 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi), %xmm0, %xmm2
 ; KNL-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test31:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi), %xmm0, %k1
 ; SKX-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %y = load <2 x double>, <2 x double>* %yp, align 4
   %mask = fcmp olt <2 x double> %x, %y
@@ -990,14 +961,12 @@ define <4 x double> @test32(<4 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi), %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi), %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %y = load <4 x double>, <4 x double>* %yp, align 4
   %mask = fcmp ogt <4 x double> %y, %x
@@ -1011,7 +980,6 @@ define <8 x double> @test33(<8 x double>
 ; CHECK-NEXT:    vcmpltpd (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <8 x double>, <8 x double>* %yp, align 4
   %mask = fcmp olt <8 x double> %x, %y
   %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -1024,14 +992,12 @@ define <4 x float> @test34(<4 x float> %
 ; KNL-NEXT:    vcmpltps (%rdi), %xmm0, %xmm2
 ; KNL-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test34:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi), %xmm0, %k1
 ; SKX-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
   %y = load <4 x float>, <4 x float>* %yp, align 4
   %mask = fcmp olt <4 x float> %x, %y
   %max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1048,14 +1014,12 @@ define <8 x float> @test35(<8 x float> %
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test35:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi), %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %y = load <8 x float>, <8 x float>* %yp, align 4
   %mask = fcmp ogt <8 x float> %y, %x
@@ -1069,7 +1033,6 @@ define <16 x float> @test36(<16 x float>
 ; CHECK-NEXT:    vcmpltps (%rdi), %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
   %y = load <16 x float>, <16 x float>* %yp, align 4
   %mask = fcmp olt <16 x float> %x, %y
   %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1082,7 +1045,6 @@ define <8 x double> @test37(<8 x double>
 ; CHECK-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1
 ; CHECK-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <8 x double> undef, double %a, i32 0
@@ -1100,14 +1062,12 @@ define <4 x double> @test38(<4 x double>
 ; KNL-NEXT:    vcmpltpd %ymm2, %ymm0, %ymm2
 ; KNL-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test38:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi){1to4}, %ymm0, %k1
 ; SKX-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <4 x double> undef, double %a, i32 0
@@ -1125,14 +1085,12 @@ define <2 x double> @test39(<2 x double>
 ; KNL-NEXT:    vcmpltpd %xmm2, %xmm0, %xmm2
 ; KNL-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test39:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltpd (%rdi){1to2}, %xmm0, %k1
 ; SKX-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <2 x double> undef, double %a, i32 0
@@ -1150,7 +1108,6 @@ define <16  x float> @test40(<16  x floa
 ; CHECK-NEXT:    vcmpltps (%rdi){1to16}, %zmm0, %k1
 ; CHECK-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-; CHECK-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <16  x float> undef, float %a, i32 0
@@ -1171,14 +1128,12 @@ define <8  x float> @test41(<8  x float>
 ; KNL-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test41:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi){1to8}, %ymm0, %k1
 ; SKX-NEXT:    vblendmps %ymm0, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <8  x float> undef, float %a, i32 0
@@ -1196,14 +1151,12 @@ define <4  x float> @test42(<4  x float>
 ; KNL-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
 ; KNL-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test42:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vcmpltps (%rdi){1to4}, %xmm0, %k1
 ; SKX-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %a = load float, float* %ptr
   %v = insertelement <4  x float> undef, float %a, i32 0
@@ -1223,7 +1176,6 @@ define <8 x double> @test43(<8 x double>
 ; KNL-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; KNL-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; KNL-NEXT:    retq
-; KNL-NEXT:    ## -- End function
 ;
 ; SKX-LABEL: test43:
 ; SKX:       ## BB#0:
@@ -1232,7 +1184,6 @@ define <8 x double> @test43(<8 x double>
 ; SKX-NEXT:    vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
 ; SKX-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
-; SKX-NEXT:    ## -- End function
 
   %a = load double, double* %ptr
   %v = insertelement <8 x double> undef, double %a, i32 0

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
 
 
 declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
@@ -11,8 +11,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.
 ; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1
 ; AVX512BW-NEXT:    kmovq %rsi, %k1
 ; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastb %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -22,8 +22,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.
 ; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1
 ; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastb %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddb %zmm2, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
     %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
@@ -41,8 +41,8 @@ declare <32 x i16> @llvm.x86.avx512.mask
 ; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm0 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vpbroadcastw %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -52,8 +52,8 @@ declare <32 x i16> @llvm.x86.avx512.mask
 ; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT:    vpbroadcastw %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
     %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
@@ -72,6 +72,7 @@ define void@test_int_x86_avx512_mask_sto
 ; AVX512BW-NEXT:    kmovq %rdx, %k1
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rdi) {%k1}
 ; AVX512BW-NEXT:    vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
@@ -81,6 +82,7 @@ define void@test_int_x86_avx512_mask_sto
 ; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vmovdqu8 %zmm0, (%ecx) {%k1}
 ; AVX512F-32-NEXT:    vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
   call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
@@ -95,6 +97,7 @@ define void@test_int_x86_avx512_mask_sto
 ; AVX512BW-NEXT:    kmovd %edx, %k1
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rdi) {%k1}
 ; AVX512BW-NEXT:    vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
@@ -104,6 +107,7 @@ define void@test_int_x86_avx512_mask_sto
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vmovdqu16 %zmm0, (%ecx) {%k1}
 ; AVX512F-32-NEXT:    vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
   call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
@@ -253,8 +257,8 @@ define <64 x i8>@test_int_x86_avx512_mas
 ; AVX512BW-NEXT:    kmovq %rdi, %k1
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
 ; AVX512BW-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
@@ -263,8 +267,8 @@ define <64 x i8>@test_int_x86_avx512_mas
 ; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
 ; AVX512F-32-NEXT:    vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddb %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddb %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
   %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
@@ -283,8 +287,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
 ; AVX512BW-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
@@ -293,8 +297,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
 ; AVX512F-32-NEXT:    vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -313,8 +317,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
 ; AVX512BW-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
@@ -323,8 +327,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
 ; AVX512F-32-NEXT:    vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -339,6 +343,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a,
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_pcmpeq_b:
@@ -351,6 +356,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a,
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $12, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
@@ -362,6 +368,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8>
 ; AVX512BW-NEXT:    kmovq %rdi, %k1
 ; AVX512BW-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpeq_b:
@@ -375,6 +382,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8>
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $12, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
@@ -387,12 +395,14 @@ define i32 @test_pcmpeq_w(<32 x i16> %a,
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_pcmpeq_w:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
@@ -404,6 +414,7 @@ define i32 @test_mask_pcmpeq_w(<32 x i16
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpeq_w:
@@ -411,6 +422,7 @@ define i32 @test_mask_pcmpeq_w(<32 x i16
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
@@ -423,6 +435,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a,
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_pcmpgt_b:
@@ -435,6 +448,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a,
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $12, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
   ret i64 %res
@@ -446,6 +460,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8>
 ; AVX512BW-NEXT:    kmovq %rdi, %k1
 ; AVX512BW-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovq %k0, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpgt_b:
@@ -459,6 +474,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8>
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $12, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
   ret i64 %res
@@ -471,12 +487,14 @@ define i32 @test_pcmpgt_w(<32 x i16> %a,
 ; AVX512BW:       ## BB#0:
 ; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_pcmpgt_w:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
   ret i32 %res
@@ -488,6 +506,7 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
 ; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_pcmpgt_w:
@@ -495,6 +514,7 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
   ret i32 %res
@@ -799,8 +819,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
@@ -809,8 +829,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
 ; AVX512F-32-NEXT:    vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -829,8 +849,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
@@ -839,8 +859,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpmovsxbw %ymm0, %zmm1 {%k1}
 ; AVX512F-32-NEXT:    vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -858,8 +878,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm3, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -868,8 +888,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm3, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
@@ -888,8 +908,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm2
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT:    vpaddw %zmm2, %zmm1, %zmm1
 ; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
@@ -898,8 +918,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm2
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm1, %zmm1
 ; AVX512F-32-NEXT:    vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
@@ -919,8 +939,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
@@ -929,8 +949,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -949,8 +969,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
@@ -959,8 +979,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsraw $3, %zmm0, %zmm1 {%k1}
 ; AVX512F-32-NEXT:    vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -979,8 +999,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
@@ -989,8 +1009,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -1009,8 +1029,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512BW-NEXT:    kmovd %esi, %k1
 ; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1}
 ; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
@@ -1019,8 +1039,8 @@ define <32 x i16>@test_int_x86_avx512_ma
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpsllw $3, %zmm0, %zmm1 {%k1}
 ; AVX512F-32-NEXT:    vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
   %res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -1650,6 +1670,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0
 ; AVX512BW-NEXT:    kxnorq %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
 ; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_cmp_b_512:
@@ -1664,28 +1685,29 @@ define i64 @test_cmp_b_512(<64 x i8> %a0
 ; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpleb %zmm0, %zmm1, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    addl (%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxnorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $60, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -1730,33 +1752,26 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512BW-NEXT:    kmovq %k0, %rax
 ; AVX512BW-NEXT:    addq %rcx, %rax
 ; AVX512BW-NEXT:    addq %rdi, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_cmp_b_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    pushl %ebx
 ; AVX512F-32-NEXT:  .Lcfi5:
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT:    pushl %ebx
+; AVX512F-32-NEXT:    pushl %esi
 ; AVX512F-32-NEXT:  .Lcfi6:
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    pushl %edi
+; AVX512F-32-NEXT:    subl $60, %esp
 ; AVX512F-32-NEXT:  .Lcfi7:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT:    pushl %esi
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
 ; AVX512F-32-NEXT:  .Lcfi8:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 20
-; AVX512F-32-NEXT:    subl $60, %esp
+; AVX512F-32-NEXT:    .cfi_offset %esi, -12
 ; AVX512F-32-NEXT:  .Lcfi9:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 80
-; AVX512F-32-NEXT:  .Lcfi10:
-; AVX512F-32-NEXT:    .cfi_offset %esi, -20
-; AVX512F-32-NEXT:  .Lcfi11:
-; AVX512F-32-NEXT:    .cfi_offset %edi, -16
-; AVX512F-32-NEXT:  .Lcfi12:
-; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:  .Lcfi13:
-; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -1777,39 +1792,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -1818,8 +1833,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -1828,8 +1843,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -1837,8 +1852,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -1847,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -1859,8 +1874,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -1868,631 +1883,639 @@ define i64 @test_mask_cmp_b_512(<64 x i8
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl $1036, %edi # imm = 0x40C
-; AVX512F-32-NEXT:    bextrl %edi, %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    andl $15, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %ebp
 ; AVX512F-32-NEXT:    shrl $13, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl $526, %edx # imm = 0x20E
-; AVX512F-32-NEXT:    bextrl %edx, %ebp, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    andl $3, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    movl $271, %esi # imm = 0x10F
-; AVX512F-32-NEXT:    bextrl %esi, %ebp, %eax
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    andl $1, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrl $16, %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $16, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    movl %eax, %ebx
 ; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    movl %ebx, %edx
 ; AVX512F-32-NEXT:    shrb $2, %bl
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $4, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $5, %dl
+; AVX512F-32-NEXT:    andb $1, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $6, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill> %ECX<def>
-; AVX512F-32-NEXT:    shrb $7, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %ebx
-; AVX512F-32-NEXT:    shrl $24, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT:    movl %ebp, %ecx
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
+; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %esi
+; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $30, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $7, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %bh, %al
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movb %ch, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %bh, %cl
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movb %ch, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    bextrl %edi, %ebx, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    andl $15, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $13, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    bextrl %edx, %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    andl $3, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    andl $1, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    shrl $16, %ebx
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    bextrl %esi, %ebx, %eax
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrl $16, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $2, %cl
-; AVX512F-32-NEXT:    shrb %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %edx
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %dl
-; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $4, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $5, %cl
-; AVX512F-32-NEXT:    andb $1, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $6, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $2, %cl
-; AVX512F-32-NEXT:    shrb %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT:    shrb $7, %bl
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpleb %zmm6, %zmm5, %k3 {%k1}
+; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
+; AVX512F-32-NEXT:    vpcmpleb %zmm5, %zmm6, %k5 {%k1}
+; AVX512F-32-NEXT:    vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vpcmpleb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpleb %zmm0, %zmm1, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    addl %ebp, %eax
-; AVX512F-32-NEXT:    adcxl %ebx, %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    addl %esi, %eax
+; AVX512F-32-NEXT:    adcl %ecx, %edx
 ; AVX512F-32-NEXT:    addl $60, %esp
 ; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    popl %edi
 ; AVX512F-32-NEXT:    popl %ebx
-; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
   %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -2537,12 +2560,13 @@ define i64 @test_ucmp_b_512(<64 x i8> %a
 ; AVX512BW-NEXT:    kxnorq %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovq %k0, %rax
 ; AVX512BW-NEXT:    addq %rcx, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_ucmp_b_512:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    subl $60, %esp
-; AVX512F-32-NEXT:  .Lcfi14:
+; AVX512F-32-NEXT:  .Lcfi10:
 ; AVX512F-32-NEXT:    .cfi_def_cfa_offset 64
 ; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
@@ -2551,28 +2575,29 @@ define i64 @test_ucmp_b_512(<64 x i8> %a
 ; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    addl (%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxnorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    addl $60, %esp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -2617,33 +2642,26 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512BW-NEXT:    kmovq %k0, %rax
 ; AVX512BW-NEXT:    addq %rcx, %rax
 ; AVX512BW-NEXT:    addq %rdi, %rax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    pushl %ebp
-; AVX512F-32-NEXT:  .Lcfi15:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    pushl %ebx
-; AVX512F-32-NEXT:  .Lcfi16:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT:    pushl %edi
-; AVX512F-32-NEXT:  .Lcfi17:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT:  .Lcfi11:
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
 ; AVX512F-32-NEXT:    pushl %esi
-; AVX512F-32-NEXT:  .Lcfi18:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 20
+; AVX512F-32-NEXT:  .Lcfi12:
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 12
 ; AVX512F-32-NEXT:    subl $60, %esp
-; AVX512F-32-NEXT:  .Lcfi19:
-; AVX512F-32-NEXT:    .cfi_def_cfa_offset 80
-; AVX512F-32-NEXT:  .Lcfi20:
-; AVX512F-32-NEXT:    .cfi_offset %esi, -20
-; AVX512F-32-NEXT:  .Lcfi21:
-; AVX512F-32-NEXT:    .cfi_offset %edi, -16
-; AVX512F-32-NEXT:  .Lcfi22:
-; AVX512F-32-NEXT:    .cfi_offset %ebx, -12
-; AVX512F-32-NEXT:  .Lcfi23:
-; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:  .Lcfi13:
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 72
+; AVX512F-32-NEXT:  .Lcfi14:
+; AVX512F-32-NEXT:    .cfi_offset %esi, -12
+; AVX512F-32-NEXT:  .Lcfi15:
+; AVX512F-32-NEXT:    .cfi_offset %ebx, -8
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm6
+; AVX512F-32-NEXT:    vmovdqa64 %zmm0, %zmm5
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $5, %al
@@ -2664,39 +2682,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    kmovd %ecx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %edx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %ebx, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
 ; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrb $6, %al
@@ -2705,8 +2723,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
@@ -2715,8 +2733,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %al
@@ -2724,8 +2742,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    andb $2, %al
@@ -2734,8 +2752,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movb %ch, %dl
@@ -2746,8 +2764,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
@@ -2755,631 +2773,639 @@ define i64 @test_mask_x86_avx512_ucmp_b_
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl $1036, %edi # imm = 0x40C
-; AVX512F-32-NEXT:    bextrl %edi, %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    andl $15, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    movl %ecx, %ebp
 ; AVX512F-32-NEXT:    shrl $13, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl $526, %edx # imm = 0x20E
-; AVX512F-32-NEXT:    bextrl %edx, %ebp, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    andl $3, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT:    movl $271, %esi # imm = 0x10F
-; AVX512F-32-NEXT:    bextrl %esi, %ebp, %eax
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    andl $1, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrl $16, %ecx
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $16, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    movl %eax, %ebx
 ; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    movl %ebx, %edx
 ; AVX512F-32-NEXT:    shrb $2, %bl
 ; AVX512F-32-NEXT:    kmovd %ebx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    shrb $3, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $4, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $5, %dl
+; AVX512F-32-NEXT:    andb $1, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $6, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill> %ECX<def>
-; AVX512F-32-NEXT:    shrb $7, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
+; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %ebx
-; AVX512F-32-NEXT:    shrl $24, %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm5
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT:    andb $15, %bl
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $2, %bl
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm6
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT:    movl %ebp, %ecx
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpbroadcastd %xmm5, %xmm5
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm5, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrl $29, %eax
-; AVX512F-32-NEXT:    andb $1, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
-; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %eax
-; AVX512F-32-NEXT:    shrl $30, %eax
+; AVX512F-32-NEXT:    shrl $24, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebp, %eax
-; AVX512F-32-NEXT:    shrl $31, %eax
-; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
 ; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
 ; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; AVX512F-32-NEXT:    kmovd %ebx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    andb $2, %al
-; AVX512F-32-NEXT:    shrb %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $4, %al
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $28, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
 ; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
 ; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %esi
+; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $6, %al
-; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $30, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %bh, %al
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %esi, %eax
+; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    kmovd %ecx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm7
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    andb $2, %al
 ; AVX512F-32-NEXT:    shrb %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movb %bh, %cl
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    shrb $3, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    bextrl %edi, %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrl $13, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $5, %al
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    bextrl %edx, %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $6, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    bextrl %esi, %ebx, %eax
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrb $7, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrl $16, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movb %ch, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $2, %cl
-; AVX512F-32-NEXT:    shrb %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
-; AVX512F-32-NEXT:    movl %ecx, %edx
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    andb $2, %al
+; AVX512F-32-NEXT:    shrb %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movb %ch, %dl
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    shrb $3, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $12, %eax
+; AVX512F-32-NEXT:    andl $15, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $13, %eax
+; AVX512F-32-NEXT:    andb $1, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $14, %eax
+; AVX512F-32-NEXT:    andl $3, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
+; AVX512F-32-NEXT:    shrl $15, %eax
+; AVX512F-32-NEXT:    andl $1, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %ebx
+; AVX512F-32-NEXT:    shrl $16, %ebx
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ebx, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    andb $15, %al
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    shrb $2, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    shrb $3, %dl
 ; AVX512F-32-NEXT:    kmovd %edx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $4, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $5, %cl
-; AVX512F-32-NEXT:    andb $1, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    shrb $6, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT:    shrb $7, %al
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    shrb $4, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    movl %ebx, %eax
-; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    shrb $5, %al
+; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $2, %cl
-; AVX512F-32-NEXT:    shrb %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    movl %eax, %ecx
-; AVX512F-32-NEXT:    andb $15, %cl
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    shrb $6, %al
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT:    shrb $7, %bl
+; AVX512F-32-NEXT:    kmovd %ebx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
 ; AVX512F-32-NEXT:    movl %ecx, %eax
-; AVX512F-32-NEXT:    shrb $2, %cl
-; AVX512F-32-NEXT:    kmovd %ecx, %k0
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT:    shrl $24, %eax
+; AVX512F-32-NEXT:    kmovd %eax, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $2, %dl
+; AVX512F-32-NEXT:    shrb %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    movl %eax, %edx
+; AVX512F-32-NEXT:    andb $15, %dl
+; AVX512F-32-NEXT:    movl %edx, %eax
+; AVX512F-32-NEXT:    shrb $2, %dl
+; AVX512F-32-NEXT:    kmovd %edx, %k0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
 ; AVX512F-32-NEXT:    shrb $3, %al
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-32-NEXT:    vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $29, %eax
 ; AVX512F-32-NEXT:    andb $1, %al
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $28, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastd %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT:    vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastd %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT:    vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $30, %eax
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512F-32-NEXT:    kmovd %eax, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT:    vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT:    vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT:    vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT:    vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT:    movl %ebx, %eax
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT:    vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT:    vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT:    movl %ecx, %eax
 ; AVX512F-32-NEXT:    shrl $31, %eax
 ; AVX512F-32-NEXT:    kshiftlq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kshiftrq $1, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %eax, %k1
 ; AVX512F-32-NEXT:    kshiftlq $63, %k1, %k1
 ; AVX512F-32-NEXT:    korq %k1, %k0, %k1
-; AVX512F-32-NEXT:    vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
+; AVX512F-32-NEXT:    vpcmpltub %zmm6, %zmm5, %k2 {%k1}
+; AVX512F-32-NEXT:    vpcmpleub %zmm6, %zmm5, %k3 {%k1}
+; AVX512F-32-NEXT:    vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
+; AVX512F-32-NEXT:    vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
+; AVX512F-32-NEXT:    vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
 ; AVX512F-32-NEXT:    kmovq %k0, (%esp)
 ; AVX512F-32-NEXT:    movl (%esp), %eax
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vpcmpltub %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    vpcmpleub %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovq %k3, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kxorq %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT:    vpcmpnltub %zmm1, %zmm0, %k2 {%k1}
-; AVX512F-32-NEXT:    vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT:    kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    kmovq %k4, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    kmovq %k5, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
 ; AVX512F-32-NEXT:    kmovq %k1, {{[0-9]+}}(%esp)
 ; AVX512F-32-NEXT:    addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT:    addl %ebp, %eax
-; AVX512F-32-NEXT:    adcxl %ebx, %edx
+; AVX512F-32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT:    addl %esi, %eax
+; AVX512F-32-NEXT:    adcl %ecx, %edx
 ; AVX512F-32-NEXT:    addl $60, %esp
 ; AVX512F-32-NEXT:    popl %esi
-; AVX512F-32-NEXT:    popl %edi
 ; AVX512F-32-NEXT:    popl %ebx
-; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
   %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -3424,6 +3450,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a
 ; AVX512BW-NEXT:    kxnord %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_cmp_w_512:
@@ -3448,6 +3475,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a
 ; AVX512F-32-NEXT:    kxnord %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3492,6 +3520,7 @@ define i32 @test_mask_cmp_w_512(<32 x i1
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    addl %ecx, %eax
 ; AVX512BW-NEXT:    addl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_cmp_w_512:
@@ -3519,6 +3548,7 @@ define i32 @test_mask_cmp_w_512(<32 x i1
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    addl %edx, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
   %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
@@ -3563,6 +3593,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %
 ; AVX512BW-NEXT:    kxnord %k0, %k0, %k0
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    addl %ecx, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_ucmp_w_512:
@@ -3587,6 +3618,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %
 ; AVX512F-32-NEXT:    kxnord %k0, %k0, %k0
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3631,6 +3663,7 @@ define i32 @test_mask_ucmp_w_512(<32 x i
 ; AVX512BW-NEXT:    kmovd %k0, %eax
 ; AVX512BW-NEXT:    addl %ecx, %eax
 ; AVX512BW-NEXT:    addl %edi, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: test_mask_ucmp_w_512:
@@ -3658,6 +3691,7 @@ define i32 @test_mask_ucmp_w_512(<32 x i
 ; AVX512F-32-NEXT:    kmovd %k0, %eax
 ; AVX512F-32-NEXT:    addl %edx, %eax
 ; AVX512F-32-NEXT:    addl %ecx, %eax
+; AVX512F-32-NEXT:    vzeroupper
 ; AVX512F-32-NEXT:    retl
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
   %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll Wed Sep 27 07:44:15 2017
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl --show-mc-encoding| FileCheck %s
 
 declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
 
@@ -9,8 +9,8 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; CHECK-NEXT:    vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT:    vpbroadcastb %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7]
+; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
@@ -30,8 +30,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
-; CHECK-NEXT:    vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7]
+; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
@@ -51,8 +51,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
-; CHECK-NEXT:    vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT:    vpbroadcastb %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7]
+; CHECK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
 ; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
@@ -73,8 +73,8 @@ declare <16 x i16> @llvm.x86.avx512.mask
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
-; CHECK-NEXT:    vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7]
+; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
@@ -93,9 +93,9 @@ define <32 x i8>@test_int_x86_avx512_pbr
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT:    vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
-; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
+; CHECK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
   %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
@@ -113,9 +113,9 @@ define <16 x i8>@test_int_x86_avx512_pbr
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
-; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
   %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
@@ -133,9 +133,9 @@ define <16 x i16>@test_int_x86_avx512_pb
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
-; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
   %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
@@ -153,9 +153,9 @@ define <8 x i16>@test_int_x86_avx512_pbr
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
   %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
@@ -173,9 +173,9 @@ define <64 x i8>@test_int_x86_avx512_pbr
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
 ; CHECK-NEXT:    kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT:    vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
-; CHECK-NEXT:    vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; CHECK-NEXT:    vpaddb %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; CHECK-NEXT:    vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
   %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
@@ -193,9 +193,9 @@ define <32 x i16>@test_int_x86_avx512_pb
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT:    vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
-; CHECK-NEXT:    vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT:    vpaddw %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
   %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
@@ -227,6 +227,7 @@ define void@test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07]
 ; CHECK-NEXT:    vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
   call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
@@ -255,6 +256,7 @@ define void@test_int_x86_avx512_mask_sto
 ; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
 ; CHECK-NEXT:    vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
   call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
@@ -345,8 +347,8 @@ define <16 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
 ; CHECK-NEXT:    ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
+; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
   %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
@@ -368,8 +370,8 @@ define <32 x i8>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
 ; CHECK-NEXT:    vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]
 ; CHECK-NEXT:    ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
-; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; CHECK-NEXT:    vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
+; CHECK-NEXT:    vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
   %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
@@ -391,8 +393,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
 ; CHECK-NEXT:    ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -414,8 +416,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
 ; CHECK-NEXT:    vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
 ; CHECK-NEXT:    ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -437,8 +439,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
 ; CHECK-NEXT:    ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -460,8 +462,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
 ; CHECK-NEXT:    vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
 ; CHECK-NEXT:    ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -476,6 +478,7 @@ define i32 @test_pcmpeq_b_256(<32 x i8>
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
   ret i32 %res
@@ -487,6 +490,7 @@ define i32 @test_mask_pcmpeq_b_256(<32 x
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
   ret i32 %res
@@ -500,6 +504,7 @@ define i16 @test_pcmpeq_w_256(<16 x i16>
 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
   ret i16 %res
@@ -512,6 +517,7 @@ define i16 @test_mask_pcmpeq_w_256(<16 x
 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
   ret i16 %res
@@ -524,6 +530,7 @@ define i32 @test_pcmpgt_b_256(<32 x i8>
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
   ret i32 %res
@@ -535,6 +542,7 @@ define i32 @test_mask_pcmpgt_b_256(<32 x
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
   ret i32 %res
@@ -548,6 +556,7 @@ define i16 @test_pcmpgt_w_256(<16 x i16>
 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
   ret i16 %res
@@ -560,6 +569,7 @@ define i16 @test_mask_pcmpgt_w_256(<16 x
 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
   ret i16 %res
@@ -1660,9 +1670,9 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
-; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1680,8 +1690,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
-; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
@@ -1701,8 +1711,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1721,8 +1731,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1741,8 +1751,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1761,8 +1771,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1780,9 +1790,9 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
@@ -1800,8 +1810,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
@@ -1821,8 +1831,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1841,8 +1851,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1861,8 +1871,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1881,8 +1891,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1936,8 +1946,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0]
 ; CHECK-NEXT:    ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -1959,8 +1969,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0]
 ; CHECK-NEXT:    ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -1980,8 +1990,8 @@ define <8 x i16>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -2000,8 +2010,8 @@ define <16 x i16>@test_int_x86_avx512_ma
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -2020,8 +2030,8 @@ define <2 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0]
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2]
+; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
@@ -2040,8 +2050,8 @@ define <4 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2]
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
@@ -2834,23 +2844,23 @@ define <8 x i32> @test_mask_cmp_b_256(<3
 ; CHECK-NEXT:    vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
+; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT:    vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
 ; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
+; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2932,23 +2942,23 @@ define <8 x i32> @test_mask_ucmp_b_256(<
 ; CHECK-NEXT:    vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
 ; CHECK-NEXT:    kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
 ; CHECK-NEXT:    vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT:    kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
+; CHECK-NEXT:    vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT:    vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
 ; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT:    vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; CHECK-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT:    kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT:    vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; CHECK-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
+; CHECK-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
 ; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
 ; CHECK-NEXT:    vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
 ; CHECK-NEXT:    vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT:    kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
+; CHECK-NEXT:    vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT:    vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
 ; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2997,6 +3007,7 @@ define <8 x i16> @test_cmp_w_256(<16 x i
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3043,6 +3054,7 @@ define <8 x i16> @test_mask_cmp_w_256(<1
 ; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3090,6 +3102,7 @@ define <8 x i16> @test_ucmp_w_256(<16 x
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
 ; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3136,6 +3149,7 @@ define <8 x i16> @test_mask_ucmp_w_256(<
 ; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
 ; CHECK-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
   %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0