[llvm] r314306 - [X86][SKX][KNL] Updated regression tests to use -mattr instead of -mcpu flag. NFC.
Gadi Haber via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 27 07:44:15 PDT 2017
Author: gadi.haber
Date: Wed Sep 27 07:44:15 2017
New Revision: 314306
URL: http://llvm.org/viewvc/llvm-project?rev=314306&view=rev
Log:
[X86][SKX][KNL] Updated regression tests to use -mattr instead of the -mcpu flag. NFC.
Updated 8 regression tests to use the -mattr flag instead of -mcpu, as follows (a representative RUN-line change is shown below):
-mcpu=knl --> -mattr=+avx512f
-mcpu=skx --> -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq
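As an illustration, the first RUN line of avx512-cvt.ll (see the diff below) changes only the target-selection flag; the FileCheck check prefixes stay exactly the same:

  Before: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl
  After:  llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f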
These updates are part of the preparation for a large commit that adds complete instruction scheduling information for the SKX target.
Reviewers: delena, zvi, RKSimon
Differential Revision: https://reviews.llvm.org/D38222
Change-Id: I2381c9b5bb75ecacfca017243c22d054f6eddd14
Modified:
llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
llvm/trunk/test/CodeGen/X86/sse42-intrinsics-x86.ll
Modified: llvm/trunk/test/CodeGen/X86/avx512-cvt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-cvt.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-cvt.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-cvt.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
@@ -143,46 +143,29 @@ define <2 x float> @sltof2f32(<2 x i64>
}
define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
-; KNL-LABEL: slto4f32_mem:
-; KNL: # BB#0:
-; KNL-NEXT: vmovdqu (%rdi), %ymm0
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: slto4f32_mem:
+; NODQ: # BB#0:
+; NODQ-NEXT: vmovdqu (%rdi), %ymm0
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto4f32_mem:
; VLDQ: # BB#0:
; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: slto4f32_mem:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vmovdqu (%rdi), %ymm0
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
; AVX512DQ-LABEL: slto4f32_mem:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vmovups (%rdi), %ymm0
@@ -190,24 +173,6 @@ define <4 x float> @slto4f32_mem(<4 x i6
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: slto4f32_mem:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%a1 = load <4 x i64>, <4 x i64>* %a, align 8
%b = sitofp <4 x i64> %a1 to <4 x float>
ret <4 x float>%b
@@ -282,21 +247,22 @@ define <4 x i64> @f32to4sl(<4 x float> %
}
define <4 x float> @slto4f32(<4 x i64> %a) {
-; KNL-LABEL: slto4f32:
-; KNL: # BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: slto4f32:
+; NODQ: # BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
; VLDQ-LABEL: slto4f32:
; VLDQ: # BB#0:
@@ -304,23 +270,6 @@ define <4 x float> @slto4f32(<4 x i64> %
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: slto4f32:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
; AVX512DQ-LABEL: slto4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
@@ -328,43 +277,27 @@ define <4 x float> @slto4f32(<4 x i64> %
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: slto4f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = sitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
define <4 x float> @ulto4f32(<4 x i64> %a) {
-; KNL-LABEL: ulto4f32:
-; KNL: # BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; KNL-NEXT: retq
+; NODQ-LABEL: ulto4f32:
+; NODQ: # BB#0:
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; NODQ-NEXT: vzeroupper
+; NODQ-NEXT: retq
;
; VLDQ-LABEL: ulto4f32:
; VLDQ: # BB#0:
@@ -372,23 +305,6 @@ define <4 x float> @ulto4f32(<4 x i64> %
; VLDQ-NEXT: vzeroupper
; VLDQ-NEXT: retq
;
-; VLNODQ-LABEL: ulto4f32:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
; AVX512DQ-LABEL: ulto4f32:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
@@ -396,23 +312,6 @@ define <4 x float> @ulto4f32(<4 x i64> %
; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: ulto4f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm1, %xmm1
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = uitofp <4 x i64> %a to <4 x float>
ret <4 x float> %b
}
@@ -457,179 +356,67 @@ define <8 x double> @ulto8f64(<8 x i64>
}
define <16 x double> @ulto16f64(<16 x i64> %a) {
-; KNL-LABEL: ulto16f64:
-; KNL: # BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; KNL-NEXT: vpextrq $1, %xmm4, %rax
-; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovq %xmm4, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
-; KNL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
-; KNL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; KNL-NEXT: retq
+; NODQ-LABEL: ulto16f64:
+; NODQ: # BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT: retq
;
; DQ-LABEL: ulto16f64:
; DQ: # BB#0:
; DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
; DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
; DQ-NEXT: retq
-;
-; VLNODQ-LABEL: ulto16f64:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm1, %rax
-; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; VLNODQ-NEXT: retq
-;
-; AVX512BW-LABEL: ulto16f64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm4, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm0
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm2
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm1, %rax
-; AVX512BW-NEXT: vcvtusi2sdq %rax, %xmm5, %xmm1
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: retq
%b = uitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
}
@@ -653,18 +440,12 @@ define <16 x i32> @f32to16ui(<16 x float
}
define <16 x i8> @f32to16uc(<16 x float> %f) {
-; KNL-LABEL: f32to16uc:
-; KNL: # BB#0:
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: retq
-;
-; AVX512-LABEL: f32to16uc:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; ALL-LABEL: f32to16uc:
+; ALL: # BB#0:
+; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
+; ALL-NEXT: vpmovdb %zmm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%res = fptoui <16 x float> %f to <16 x i8>
ret <16 x i8> %res
}
@@ -696,33 +477,18 @@ define <8 x i32> @f32to8ui(<8 x float> %
}
define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
-; KNL-LABEL: f32to4ui:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttps2udq %zmm0, %zmm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: f32to4ui:
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: f32to4ui:
; VL: # BB#0:
; VL-NEXT: vcvttps2udq %xmm0, %xmm0
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: f32to4ui:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: f32to4ui:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = fptoui <4 x float> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -737,12 +503,13 @@ define <8 x i32> @f64to8ui(<8 x double>
}
define <8 x i16> @f64to8us(<8 x double> %f) {
-; KNL-LABEL: f64to8us:
-; KNL: # BB#0:
-; KNL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: f64to8us:
+; NOVL: # BB#0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: f64to8us:
; VL: # BB#0:
@@ -750,33 +517,18 @@ define <8 x i16> @f64to8us(<8 x double>
; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: f64to8us:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: f64to8us:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i16>
ret <8 x i16> %res
}
define <8 x i8> @f64to8uc(<8 x double> %f) {
-; KNL-LABEL: f64to8uc:
-; KNL: # BB#0:
-; KNL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: f64to8uc:
+; NOVL: # BB#0:
+; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
+; NOVL-NEXT: vpmovdw %zmm0, %ymm0
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: f64to8uc:
; VL: # BB#0:
@@ -784,55 +536,24 @@ define <8 x i8> @f64to8uc(<8 x double> %
; VL-NEXT: vpmovdw %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: f64to8uc:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: f64to8uc:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%res = fptoui <8 x double> %f to <8 x i8>
ret <8 x i8> %res
}
define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
-; KNL-LABEL: f64to4ui:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: f64to4ui:
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: f64to4ui:
; VL: # BB#0:
; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
; VL-NEXT: vzeroupper
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: f64to4ui:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: f64to4ui:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = fptoui <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -926,16 +647,11 @@ define <8 x i32> @f64to8si(<8 x double>
}
define <4 x i32> @f64to4si(<4 x double> %a) {
-; KNL-LABEL: f64to4si:
-; KNL: # BB#0:
-; KNL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; KNL-NEXT: retq
-;
-; AVX512-LABEL: f64to4si:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; ALL-LABEL: f64to4si:
+; ALL: # BB#0:
+; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%b = fptosi <4 x double> %a to <4 x i32>
ret <4 x i32> %b
}
@@ -952,28 +668,24 @@ define <16 x float> @f64to16f32(<16 x do
}
define <4 x float> @f64to4f32(<4 x double> %b) {
-; KNL-LABEL: f64to4f32:
-; KNL: # BB#0:
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: retq
-;
-; AVX512-LABEL: f64to4f32:
-; AVX512: # BB#0:
-; AVX512-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; ALL-LABEL: f64to4f32:
+; ALL: # BB#0:
+; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
ret <4 x float> %a
}
define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
-; KNL-LABEL: f64to4f32_mask:
-; KNL: # BB#0:
-; KNL-NEXT: vpslld $31, %xmm1, %xmm1
-; KNL-NEXT: vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; KNL-NEXT: vpand %xmm0, %xmm1, %xmm0
-; KNL-NEXT: retq
+; NOVL-LABEL: f64to4f32_mask:
+; NOVL: # BB#0:
+; NOVL-NEXT: vpslld $31, %xmm1, %xmm1
+; NOVL-NEXT: vpsrad $31, %xmm1, %xmm1
+; NOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
+; NOVL-NEXT: vpand %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: f64to4f32_mask:
; VL: # BB#0:
@@ -982,24 +694,6 @@ define <4 x float> @f64to4f32_mask(<4 x
; VL-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
; VL-NEXT: vzeroupper
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: f64to4f32_mask:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX512DQ-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: f64to4f32_mask:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrad $31, %xmm1, %xmm1
-; AVX512BW-NEXT: vcvtpd2ps %ymm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%a = fptrunc <4 x double> %b to <4 x float>
%c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
ret <4 x float> %c
@@ -1180,100 +874,40 @@ define <16 x double> @uito16f64(<16 x i3
}
define <8 x float> @slto8f32(<8 x i64> %a) {
-; KNL-LABEL: slto8f32:
-; KNL: # BB#0:
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: slto8f32:
+; NODQ: # BB#0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
;
; DQ-LABEL: slto8f32:
; DQ: # BB#0:
; DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
; DQ-NEXT: retq
-;
-; VLNODQ-LABEL: slto8f32:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vmovq %xmm1, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VLNODQ-NEXT: retq
-;
-; AVX512BW-LABEL: slto8f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm1, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm1
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: retq
%b = sitofp <8 x i64> %a to <8 x float>
ret <8 x float> %b
}
@@ -1386,278 +1020,106 @@ define <8 x double> @slto8f64(<8 x i64>
}
define <16 x double> @slto16f64(<16 x i64> %a) {
-; KNL-LABEL: slto16f64:
-; KNL: # BB#0:
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm4
-; KNL-NEXT: vpextrq $1, %xmm4, %rax
-; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovq %xmm4, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
-; KNL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; KNL-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm3, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; KNL-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
-; KNL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; KNL-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; KNL-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; KNL-NEXT: retq
+; NODQ-LABEL: slto16f64:
+; NODQ: # BB#0:
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
+; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
+; NODQ-NEXT: vpextrq $1, %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm3, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
+; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; NODQ-NEXT: retq
;
; DQ-LABEL: slto16f64:
; DQ: # BB#0:
; DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
; DQ-NEXT: vcvtqq2pd %zmm1, %zmm1
; DQ-NEXT: retq
-;
-; VLNODQ-LABEL: slto16f64:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; VLNODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
-; VLNODQ-NEXT: vpextrq $1, %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm3, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; VLNODQ-NEXT: vmovq %xmm1, %rax
-; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
-; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; VLNODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; VLNODQ-NEXT: retq
-;
-; AVX512BW-LABEL: slto16f64:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm4, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm0
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm2
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512BW-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm3, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm4
-; AVX512BW-NEXT: vmovq %xmm1, %rax
-; AVX512BW-NEXT: vcvtsi2sdq %rax, %xmm5, %xmm1
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX512BW-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512BW-NEXT: retq
%b = sitofp <16 x i64> %a to <16 x double>
ret <16 x double> %b
}
define <8 x float> @ulto8f32(<8 x i64> %a) {
-; KNL-LABEL: ulto8f32:
-; KNL: # BB#0:
-; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; KNL-NEXT: vpextrq $1, %xmm1, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; KNL-NEXT: vmovq %xmm1, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; KNL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; KNL-NEXT: vmovq %xmm2, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm2, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, %rax
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm3[0],xmm2[3]
-; KNL-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
-; KNL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
+; NODQ-LABEL: ulto8f32:
+; NODQ: # BB#0:
+; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-NEXT: vpextrq $1, %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
+; NODQ-NEXT: vmovq %xmm1, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
+; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-NEXT: vmovq %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
+; NODQ-NEXT: vpextrq $1, %xmm2, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
+; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
+; NODQ-NEXT: vmovq %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
+; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
+; NODQ-NEXT: vpextrq $1, %xmm0, %rax
+; NODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
+; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; NODQ-NEXT: retq
;
; DQ-LABEL: ulto8f32:
; DQ: # BB#0:
; DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
; DQ-NEXT: retq
-;
-; VLNODQ-LABEL: ulto8f32:
-; VLNODQ: # BB#0:
-; VLNODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; VLNODQ-NEXT: vpextrq $1, %xmm1, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; VLNODQ-NEXT: vmovq %xmm1, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; VLNODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; VLNODQ-NEXT: vmovq %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm2, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; VLNODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
-; VLNODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VLNODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VLNODQ-NEXT: retq
-;
-; AVX512BW-LABEL: ulto8f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovq %xmm1, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm1
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vmovq %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm3, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm2
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm3
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512BW-NEXT: vcvtusi2ssq %rax, %xmm4, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: retq
%b = uitofp <8 x i64> %a to <8 x float>
ret <8 x float> %b
}
@@ -1852,33 +1314,18 @@ define <8 x float> @uito8f32(<8 x i32> %
}
define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
-; KNL-LABEL: uito4f32:
-; KNL: # BB#0:
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; KNL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; KNL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; KNL-NEXT: retq
+; NOVL-LABEL: uito4f32:
+; NOVL: # BB#0:
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
+; NOVL-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; NOVL-NEXT: vzeroupper
+; NOVL-NEXT: retq
;
; VL-LABEL: uito4f32:
; VL: # BB#0:
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uito4f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: uito4f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512BW-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
%b = uitofp <4 x i32> %a to <4 x float>
ret <4 x float> %b
}
@@ -2510,19 +1957,19 @@ define <4 x double> @ubto4f64(<4 x i32>
}
define <2 x float> @ubto2f32(<2 x i32> %a) {
-; KNL-LABEL: ubto2f32:
-; KNL: # BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; KNL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrb $8, %xmm0, %eax
-; KNL-NEXT: andl $1, %eax
-; KNL-NEXT: vpextrb $0, %xmm0, %ecx
-; KNL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; KNL-NEXT: andl $1, %ecx
-; KNL-NEXT: vcvtsi2ssl %ecx, %xmm2, %xmm1
-; KNL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
-; KNL-NEXT: retq
+; NOVL-LABEL: ubto2f32:
+; NOVL: # BB#0:
+; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; NOVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; NOVL-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; NOVL-NEXT: vpextrb $8, %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
+; NOVL-NEXT: vpextrb $0, %xmm0, %eax
+; NOVL-NEXT: andl $1, %eax
+; NOVL-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
+; NOVL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; NOVL-NEXT: retq
;
; VL-LABEL: ubto2f32:
; VL: # BB#0:
@@ -2532,34 +1979,6 @@ define <2 x float> @ubto2f32(<2 x i32> %
; VL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z}
; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
; VL-NEXT: retq
-;
-; AVX512DQ-LABEL: ubto2f32:
-; AVX512DQ: # BB#0:
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512DQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512DQ-NEXT: andl $1, %eax
-; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: andl $1, %eax
-; AVX512DQ-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; AVX512DQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: ubto2f32:
-; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512BW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpextrb $8, %xmm0, %eax
-; AVX512BW-NEXT: andl $1, %eax
-; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm1
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: andl $1, %eax
-; AVX512BW-NEXT: vcvtsi2ssl %eax, %xmm2, %xmm0
-; AVX512BW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX512BW-NEXT: retq
%mask = icmp ult <2 x i32> %a, zeroinitializer
%1 = uitofp <2 x i1> %mask to <2 x float>
ret <2 x float> %1
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Wed Sep 27 07:44:15 2017
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefix=CHECK --check-prefix=KNL %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefix=CHECK --check-prefix=SKX --check-prefix=SKX_VBMI %s
define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
; CHECK-LABEL: test1:
@@ -12,7 +12,6 @@ define <16 x float> @test1(<16 x float>
; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%rrr = load float, float* %br
%rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
%rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
@@ -28,7 +27,6 @@ define <8 x double> @test2(<8 x double>
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@@ -42,7 +40,6 @@ define <16 x float> @test3(<16 x float>
; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%eee = extractelement <16 x float> %x, i32 4
%rrr2 = insertelement <16 x float> %x, float %eee, i32 1
ret <16 x float> %rrr2
@@ -56,7 +53,6 @@ define <8 x i64> @test4(<8 x i64> %x) no
; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@@ -67,7 +63,6 @@ define i32 @test5(<4 x float> %x) nounwi
; CHECK: ## BB#0:
; CHECK-NEXT: vextractps $3, %xmm0, %eax
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
%ei = bitcast float %ef to i32
ret i32 %ei
@@ -78,148 +73,83 @@ define void @test6(<4 x float> %x, float
; CHECK: ## BB#0:
; CHECK-NEXT: vextractps $3, %xmm0, (%rdi)
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%ef = extractelement <4 x float> %x, i32 3
store float %ef, float* %out, align 4
ret void
}
define float @test7(<16 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test7:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test7:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test7:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <16 x float> %x, i32 %ind
ret float %e
}
define double @test8(<8 x double> %x, i32 %ind) nounwind {
-; KNL-LABEL: test8:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test8:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <8 x double> %x, i32 %ind
ret double %e
}
define float @test9(<8 x float> %x, i32 %ind) nounwind {
-; KNL-LABEL: test9:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-32, %rsp
-; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: andl $7, %edi
-; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test9:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-32, %rsp
-; SKX-NEXT: subq $64, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %ymm0, (%rsp)
-; SKX-NEXT: andl $7, %edi
-; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test9:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-32, %rsp
+; CHECK-NEXT: subq $64, %rsp
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: vmovaps %ymm0, (%rsp)
+; CHECK-NEXT: andl $7, %edi
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <8 x float> %x, i32 %ind
ret float %e
}
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
-; KNL-LABEL: test10:
-; KNL: ## BB#0:
-; KNL-NEXT: pushq %rbp
-; KNL-NEXT: movq %rsp, %rbp
-; KNL-NEXT: andq $-64, %rsp
-; KNL-NEXT: subq $128, %rsp
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: vmovaps %zmm0, (%rsp)
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: movl (%rsp,%rdi,4), %eax
-; KNL-NEXT: movq %rbp, %rsp
-; KNL-NEXT: popq %rbp
-; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
-;
-; SKX-LABEL: test10:
-; SKX: ## BB#0:
-; SKX-NEXT: pushq %rbp
-; SKX-NEXT: movq %rsp, %rbp
-; SKX-NEXT: andq $-64, %rsp
-; SKX-NEXT: subq $128, %rsp
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %zmm0, (%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: movl (%rsp,%rdi,4), %eax
-; SKX-NEXT: movq %rbp, %rsp
-; SKX-NEXT: popq %rbp
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
+; CHECK-LABEL: test10:
+; CHECK: ## BB#0:
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: movq %rsp, %rbp
+; CHECK-NEXT: andq $-64, %rsp
+; CHECK-NEXT: subq $128, %rsp
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: vmovaps %zmm0, (%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: movl (%rsp,%rdi,4), %eax
+; CHECK-NEXT: movq %rbp, %rsp
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%e = extractelement <16 x i32> %x, i32 %ind
ret i32 %e
}
@@ -274,6 +204,7 @@ define i64 @test12(<16 x i64>%a, <16 x i
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test12:
@@ -339,6 +270,7 @@ define i64 @test14(<8 x i64>%a, <8 x i64
; KNL-NEXT: testb $1, %al
; KNL-NEXT: cmoveq %rsi, %rdi
; KNL-NEXT: movq %rdi, %rax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test14:
@@ -387,6 +319,7 @@ define i16 @test16(i1 *%addr, i16 %a) {
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test16:
@@ -424,6 +357,7 @@ define i8 @test17(i1 *%addr, i8 %a) {
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test17:
@@ -448,20 +382,13 @@ define i8 @test17(i1 *%addr, i8 %a) {
}
define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v8i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v8i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v8i64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <8 x i64> %x, i32 1
%r2 = extractelement <8 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
@@ -469,20 +396,13 @@ define i64 @extract_v8i64(<8 x i64> %x,
}
define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
-; KNL-LABEL: extract_v4i64:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrq $1, %xmm0, %rax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrq $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v4i64:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v4i64:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrq $1, %xmm0, %rax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <4 x i64> %x, i32 1
%r2 = extractelement <4 x i64> %x, i32 3
store i64 %r2, i64* %dst, align 1
@@ -502,20 +422,13 @@ define i64 @extract_v2i64(<2 x i64> %x,
}
define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v16i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v16i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v16i32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrd $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <16 x i32> %x, i32 1
%r2 = extractelement <16 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
@@ -523,20 +436,13 @@ define i32 @extract_v16i32(<16 x i32> %x
}
define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
-; KNL-LABEL: extract_v8i32:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrd $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrd $1, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v8i32:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrd $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrd $1, %xmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v8i32:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrd $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrd $1, %xmm0, (%rdi)
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <8 x i32> %x, i32 1
%r2 = extractelement <8 x i32> %x, i32 5
store i32 %r2, i32* %dst, align 1
@@ -556,22 +462,14 @@ define i32 @extract_v4i32(<4 x i32> %x,
}
define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v32i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrw $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v32i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v32i16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <32 x i16> %x, i32 1
%r2 = extractelement <32 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
@@ -579,22 +477,14 @@ define i16 @extract_v32i16(<32 x i16> %x
}
define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
-; KNL-LABEL: extract_v16i16:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrw $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrw $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v16i16:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrw $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrw $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v16i16:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrw $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrw $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <16 x i16> %x, i32 1
%r2 = extractelement <16 x i16> %x, i32 9
store i16 %r2, i16* %dst, align 1
@@ -615,22 +505,14 @@ define i16 @extract_v8i16(<8 x i16> %x,
}
define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v64i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrb $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v64i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v64i8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrb $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <64 x i8> %x, i32 1
%r2 = extractelement <64 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
@@ -638,22 +520,14 @@ define i8 @extract_v64i8(<64 x i8> %x, i
}
define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
-; KNL-LABEL: extract_v32i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vpextrb $1, %xmm0, %eax
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; KNL-NEXT: vpextrb $1, %xmm0, (%rdi)
-; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; KNL-NEXT: retq
-;
-; SKX-LABEL: extract_v32i8:
-; SKX: ## BB#0:
-; SKX-NEXT: vpextrb $1, %xmm0, %eax
-; SKX-NEXT: vextracti128 $1, %ymm0, %xmm0
-; SKX-NEXT: vpextrb $1, %xmm0, (%rdi)
-; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
+; CHECK-LABEL: extract_v32i8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpextrb $1, %xmm0, %eax
+; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0
+; CHECK-NEXT: vpextrb $1, %xmm0, (%rdi)
+; CHECK-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%r1 = extractelement <32 x i8> %x, i32 1
%r2 = extractelement <32 x i8> %x, i32 17
store i8 %r2, i8* %dst, align 1
@@ -936,6 +810,7 @@ define i32 @test_insertelement_v32i1(i32
; KNL-NEXT: subq $32, %rsp
; KNL-NEXT: xorl %eax, %eax
; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
; KNL-NEXT: vpcmpltud %zmm3, %zmm1, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
@@ -1061,7 +936,6 @@ define i32 @test_insertelement_v32i1(i32
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT: setb %al
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
@@ -1083,6 +957,7 @@ define i32 @test_insertelement_v32i1(i32
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_insertelement_v32i1:
@@ -1112,23 +987,23 @@ define i8 @test_iinsertelement_v4i1(i32
; KNL-LABEL: test_iinsertelement_v4i1:
; KNL: ## BB#0:
; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: setb %al
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; KNL-NEXT: vpextrb $4, %xmm0, %eax
-; KNL-NEXT: setb %cl
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpextrb $4, %xmm0, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: vpextrb $0, %xmm0, %eax
-; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vpextrb $0, %xmm0, %ecx
+; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm1
; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; KNL-NEXT: kmovw %ecx, %k1
+; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7]
; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
@@ -1144,6 +1019,7 @@ define i8 @test_iinsertelement_v4i1(i32
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v4i1:
@@ -1188,6 +1064,7 @@ define i8 @test_iinsertelement_v2i1(i32
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_iinsertelement_v2i1:
@@ -1308,6 +1185,7 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
; KNL-NEXT: vpextrb $2, %xmm0, %eax
; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v32i1:
@@ -1338,6 +1216,7 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_v64i1:
@@ -1370,6 +1249,7 @@ define zeroext i8 @extractelement_v64i1_
; KNL-NEXT: movb $4, %cl
; KNL-NEXT: subb %al, %cl
; KNL-NEXT: movzbl %cl, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: extractelement_v64i1_alt:
@@ -1421,6 +1301,7 @@ define i64 @test_extractelement_variable
; KNL-NEXT: movq (%rsp,%rdi,8), %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v4i64:
@@ -1466,6 +1347,7 @@ define i64 @test_extractelement_variable
; KNL-NEXT: movq (%rsp,%rdi,8), %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8i64:
@@ -1523,6 +1405,7 @@ define double @test_extractelement_varia
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v4f64:
@@ -1568,6 +1451,7 @@ define double @test_extractelement_varia
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8f64:
@@ -1625,6 +1509,7 @@ define i32 @test_extractelement_variable
; KNL-NEXT: movl (%rsp,%rdi,4), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8i32:
@@ -1670,6 +1555,7 @@ define i32 @test_extractelement_variable
; KNL-NEXT: movl (%rsp,%rdi,4), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16i32:
@@ -1727,6 +1613,7 @@ define float @test_extractelement_variab
; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v8f32:
@@ -1772,6 +1659,7 @@ define float @test_extractelement_variab
; KNL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16f32:
@@ -1829,6 +1717,7 @@ define i16 @test_extractelement_variable
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v16i16:
@@ -1875,6 +1764,7 @@ define i16 @test_extractelement_variable
; KNL-NEXT: movzwl (%rsp,%rdi,2), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i16:
@@ -1902,23 +1792,14 @@ define i16 @test_extractelement_variable
}
define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
-; KNL-LABEL: test_extractelement_variable_v16i8:
-; KNL: ## BB#0:
-; KNL-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; KNL-NEXT: andl $15, %edi
-; KNL-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; KNL-NEXT: movb (%rdi,%rax), %al
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_extractelement_variable_v16i8:
-; SKX: ## BB#0:
-; SKX-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
-; SKX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; SKX-NEXT: andl $15, %edi
-; SKX-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
-; SKX-NEXT: movb (%rdi,%rax), %al
-; SKX-NEXT: retq
+; CHECK-LABEL: test_extractelement_variable_v16i8:
+; CHECK: ## BB#0:
+; CHECK-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: andl $15, %edi
+; CHECK-NEXT: leaq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT: movb (%rdi,%rax), %al
+; CHECK-NEXT: retq
%t2 = extractelement <16 x i8> %t1, i32 %index
ret i8 %t2
}
@@ -1936,13 +1817,14 @@ define i8 @test_extractelement_variable_
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
-; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT: vmovaps %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v32i8:
@@ -1984,14 +1866,15 @@ define i8 @test_extractelement_variable_
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp)
; KNL-NEXT: vmovaps %ymm0, (%rsp)
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $63, %edi
; KNL-NEXT: movq %rsp, %rax
; KNL-NEXT: movb (%rdi,%rax), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8:
@@ -2042,6 +1925,7 @@ define i8 @test_extractelement_variable_
; KNL-NEXT: movb (%rax,%rcx), %al
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
@@ -2075,12 +1959,12 @@ define i8 @test_extractelement_variable_
define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v2i1:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $1, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,8), %eax
; KNL-NEXT: andl $1, %eax
@@ -2105,12 +1989,12 @@ define zeroext i8 @test_extractelement_v
define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
; KNL-LABEL: test_extractelement_varible_v4i1:
; KNL: ## BB#0:
+; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; KNL-NEXT: vpxor %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpxor %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
; KNL-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
-; KNL-NEXT: ## kill: %EDI<def> %EDI<kill> %RDI<def>
; KNL-NEXT: andl $3, %edi
; KNL-NEXT: movl -24(%rsp,%rdi,4), %eax
; KNL-NEXT: andl $1, %eax
@@ -2156,6 +2040,7 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v8i1:
@@ -2209,6 +2094,7 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v16i1:
@@ -2265,6 +2151,7 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_extractelement_varible_v32i1:
Modified: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
; RUN: llc < %s -stack-symbol-ordering=0 -mtriple=x86_64-apple-darwin -mattr=+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
@@ -287,7 +287,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: shuf_test1:
; SKX: ## BB#0:
@@ -296,7 +295,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
;
; AVX512BW-LABEL: shuf_test1:
; AVX512BW: ## BB#0:
@@ -305,7 +303,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: ## -- End function
;
; AVX512DQ-LABEL: shuf_test1:
; AVX512DQ: ## BB#0:
@@ -314,7 +311,6 @@ define i8 @shuf_test1(i16 %v) nounwind {
; AVX512DQ-NEXT: kmovw %k0, %eax
; AVX512DQ-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
; AVX512DQ-NEXT: retq
-; AVX512DQ-NEXT: ## -- End function
%v1 = bitcast i16 %v to <16 x i1>
%mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%mask1 = bitcast <8 x i1> %mask to i8
@@ -329,6 +325,7 @@ define i32 @zext_test1(<16 x i32> %a, <1
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test1:
@@ -375,6 +372,7 @@ define i16 @zext_test2(<16 x i32> %a, <1
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andl $1, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test2:
@@ -424,6 +422,7 @@ define i8 @zext_test3(<16 x i32> %a, <16
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $1, %al
; KNL-NEXT: ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: zext_test3:
@@ -516,6 +515,7 @@ define <4 x i32> @test4(<4 x i64> %x, <4
; KNL-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm1
; KNL-NEXT: vpmovqd %zmm1, %ymm1
; KNL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test4:
@@ -611,6 +611,7 @@ define void @test7(<8 x i1> %mask) {
; KNL-NEXT: korw %k1, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test7:
@@ -661,8 +662,8 @@ false:
define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
; KNL-LABEL: test8:
; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: cmpl %esi, %edi
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; KNL-NEXT: jg LBB17_1
; KNL-NEXT: ## BB#2:
; KNL-NEXT: vpcmpltud %zmm2, %zmm1, %k1
@@ -672,12 +673,13 @@ define <16 x i8> @test8(<16 x i32>%a, <1
; KNL-NEXT: LBB17_3:
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test8:
; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: cmpl %esi, %edi
+; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; SKX-NEXT: jg LBB17_1
; SKX-NEXT: ## BB#2:
; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0
@@ -743,6 +745,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test9:
@@ -883,6 +886,7 @@ define <16 x i1> @test15(i32 %x, i32 %y)
; KNL-NEXT: kmovw %ecx, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test15:
@@ -1189,6 +1193,7 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
; KNL-NEXT: korw %k1, %k0, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqw %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test18:
@@ -1289,7 +1294,6 @@ define <32 x i16> @test21(<32 x i16> %x
; KNL-NEXT: vpsraw $15, %ymm2, %ymm2
; KNL-NEXT: vpand %ymm1, %ymm2, %ymm1
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test21:
; SKX: ## BB#0:
@@ -1297,7 +1301,6 @@ define <32 x i16> @test21(<32 x i16> %x
; SKX-NEXT: vpmovb2m %ymm1, %k1
; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
;
; AVX512BW-LABEL: test21:
; AVX512BW: ## BB#0:
@@ -1305,7 +1308,6 @@ define <32 x i16> @test21(<32 x i16> %x
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: retq
-; AVX512BW-NEXT: ## -- End function
;
; AVX512DQ-LABEL: test21:
; AVX512DQ: ## BB#0:
@@ -1319,7 +1321,6 @@ define <32 x i16> @test21(<32 x i16> %x
; AVX512DQ-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512DQ-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: retq
-; AVX512DQ-NEXT: ## -- End function
%ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
ret <32 x i16> %ret
}
@@ -1332,6 +1333,7 @@ define void @test22(<4 x i1> %a, <4 x i1
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test22:
@@ -1371,6 +1373,7 @@ define void @test23(<2 x i1> %a, <2 x i1
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test23:
@@ -1450,6 +1453,7 @@ define void @store_v2i1(<2 x i1> %c , <2
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v2i1:
@@ -1494,6 +1498,7 @@ define void @store_v4i1(<4 x i1> %c , <4
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v4i1:
@@ -1538,6 +1543,7 @@ define void @store_v8i1(<8 x i1> %c , <8
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v8i1:
@@ -1580,6 +1586,7 @@ define void @store_v16i1(<16 x i1> %c ,
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: knotw %k0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_v16i1:
@@ -1738,9 +1745,11 @@ define void @ktest_1(<8 x double> %in, d
; KNL-NEXT: je LBB41_2
; KNL-NEXT: ## BB#1: ## %L1
; KNL-NEXT: vmovapd %zmm0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
; KNL-NEXT: LBB41_2: ## %L2
; KNL-NEXT: vmovapd %zmm0, 8(%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_1:
@@ -1835,73 +1844,9 @@ define void @ktest_2(<32 x float> %in, f
; KNL-NEXT: .cfi_def_cfa_register %rbp
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $32, %rsp
-; KNL-NEXT: vmovups 64(%rdi), %zmm2
-; KNL-NEXT: vcmpltps %zmm1, %zmm2, %k2
-; KNL-NEXT: kshiftlw $14, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: kshiftlw $15, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm2
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $13, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $12, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $11, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $10, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $9, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $8, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $7, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $6, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $5, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $4, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $3, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $2, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftlw $1, %k2, %k0
-; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
-; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
-; KNL-NEXT: vmovups (%rdi), %zmm3
-; KNL-NEXT: vcmpltps %zmm0, %zmm3, %k1
+; KNL-NEXT: vmovups (%rdi), %zmm2
+; KNL-NEXT: vmovups 64(%rdi), %zmm3
+; KNL-NEXT: vcmpltps %zmm1, %zmm3, %k1
; KNL-NEXT: kshiftlw $14, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1965,138 +1910,202 @@ define void @ktest_2(<32 x float> %in, f
; KNL-NEXT: kshiftrw $15, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
-; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k2} {z}
+; KNL-NEXT: vcmpltps %zmm0, %zmm2, %k2
+; KNL-NEXT: kshiftlw $14, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kshiftlw $15, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: vmovd %ecx, %xmm2
+; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $13, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $2, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $12, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $3, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $11, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $10, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $5, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $9, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $6, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $8, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $7, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $8, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $6, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $9, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $5, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $4, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $11, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $3, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $12, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $2, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftlw $1, %k2, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm2, %xmm2
+; KNL-NEXT: kshiftrw $15, %k2, %k0
+; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
+; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; KNL-NEXT: vmovups 4(%rdi), %zmm3 {%k2} {z}
+; KNL-NEXT: vmovups 68(%rdi), %zmm4 {%k1} {z}
; KNL-NEXT: vcmpltps %zmm4, %zmm1, %k0
-; KNL-NEXT: kshiftlw $14, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
-; KNL-NEXT: kshiftlw $15, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %ecx
+; KNL-NEXT: kshiftlw $14, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kshiftlw $15, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: vmovd %ecx, %xmm4
; KNL-NEXT: vpinsrb $1, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $13, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $13, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $2, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $12, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $12, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $3, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $11, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $11, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $4, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $10, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $10, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $5, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $9, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $9, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $6, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $8, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $8, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $7, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $7, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $7, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $8, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $6, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $6, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $9, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $5, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $5, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $10, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $4, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $11, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $3, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $3, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $12, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $2, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $2, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $13, %eax, %xmm4, %xmm4
-; KNL-NEXT: kshiftlw $1, %k0, %k2
-; KNL-NEXT: kshiftrw $15, %k2, %k2
-; KNL-NEXT: kmovw %k2, %eax
+; KNL-NEXT: kshiftlw $1, %k0, %k1
+; KNL-NEXT: kshiftrw $15, %k1, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: vpinsrb $14, %eax, %xmm4, %xmm4
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm4, %xmm4
-; KNL-NEXT: vmovups 4(%rdi), %zmm5 {%k1} {z}
-; KNL-NEXT: vcmpltps %zmm5, %zmm0, %k0
+; KNL-NEXT: vcmpltps %zmm3, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm5
-; KNL-NEXT: vpinsrb $1, %eax, %xmm5, %xmm5
+; KNL-NEXT: vmovd %ecx, %xmm3
+; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $2, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $3, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $5, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $6, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $8, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $9, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $11, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $12, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $13, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm5, %xmm5
+; KNL-NEXT: vpinsrb $14, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm5, %xmm5
-; KNL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; KNL-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3
+; KNL-NEXT: vpinsrb $15, %eax, %xmm3, %xmm3
+; KNL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; KNL-NEXT: vpor %ymm3, %ymm2, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
@@ -2119,6 +2128,7 @@ define void @ktest_2(<32 x float> %in, f
; KNL-NEXT: LBB42_3: ## %End
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: ktest_2:
@@ -2565,6 +2575,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a)
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_2i1:
@@ -2602,6 +2613,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a)
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: load_4i1:
@@ -2730,6 +2742,7 @@ define void @store_8i1(<8 x i1>* %a, <8
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1:
@@ -2768,6 +2781,7 @@ define void @store_8i1_1(<8 x i1>* %a, <
; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movb %al, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_8i1_1:
@@ -2806,6 +2820,7 @@ define void @store_16i1(<16 x i1>* %a, <
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_16i1:
@@ -2847,6 +2862,7 @@ define void @store_32i1(<32 x i1>* %a, <
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1:
@@ -2897,6 +2913,7 @@ define void @store_32i1_1(<32 x i1>* %a,
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_32i1_1:
@@ -2941,6 +2958,36 @@ define void @store_64i1(<64 x i1>* %a, <
;
; KNL-LABEL: store_64i1:
; KNL: ## BB#0:
+; KNL-NEXT: pushq %rbp
+; KNL-NEXT: Lcfi9:
+; KNL-NEXT: .cfi_def_cfa_offset 16
+; KNL-NEXT: pushq %r15
+; KNL-NEXT: Lcfi10:
+; KNL-NEXT: .cfi_def_cfa_offset 24
+; KNL-NEXT: pushq %r14
+; KNL-NEXT: Lcfi11:
+; KNL-NEXT: .cfi_def_cfa_offset 32
+; KNL-NEXT: pushq %r13
+; KNL-NEXT: Lcfi12:
+; KNL-NEXT: .cfi_def_cfa_offset 40
+; KNL-NEXT: pushq %r12
+; KNL-NEXT: Lcfi13:
+; KNL-NEXT: .cfi_def_cfa_offset 48
+; KNL-NEXT: pushq %rbx
+; KNL-NEXT: Lcfi14:
+; KNL-NEXT: .cfi_def_cfa_offset 56
+; KNL-NEXT: Lcfi15:
+; KNL-NEXT: .cfi_offset %rbx, -56
+; KNL-NEXT: Lcfi16:
+; KNL-NEXT: .cfi_offset %r12, -48
+; KNL-NEXT: Lcfi17:
+; KNL-NEXT: .cfi_offset %r13, -40
+; KNL-NEXT: Lcfi18:
+; KNL-NEXT: .cfi_offset %r14, -32
+; KNL-NEXT: Lcfi19:
+; KNL-NEXT: .cfi_offset %r15, -24
+; KNL-NEXT: Lcfi20:
+; KNL-NEXT: .cfi_offset %rbp, -16
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -2952,66 +2999,66 @@ define void @store_64i1(<64 x i1>* %a, <
; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %ecx, %xmm3
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: vpinsrb $1, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r14d
; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: vpinsrb $2, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: vpinsrb $4, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: vpinsrb $5, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: vpinsrb $7, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: vpinsrb $8, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: vpinsrb $10, %eax, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: vpinsrb $11, %edx, %xmm3, %xmm3
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: vmovd %r9d, %xmm3
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm3, %xmm2
-; KNL-NEXT: vpinsrb $13, %eax, %xmm2, %xmm2
-; KNL-NEXT: vpinsrb $14, %edx, %xmm2, %xmm2
; KNL-NEXT: kshiftrw $15, %k0, %k0
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm3, %xmm2
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $8, %ebx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $9, %ebp, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $11, %ecx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $12, %edx, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $13, %esi, %xmm2, %xmm2
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm2, %xmm2
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
@@ -3020,66 +3067,66 @@ define void @store_64i1(<64 x i1>* %a, <
; KNL-NEXT: kmovw %k0, 6(%rdi)
; KNL-NEXT: kshiftlw $14, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r8d
; KNL-NEXT: kshiftlw $15, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: kshiftlw $13, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %r9d
; KNL-NEXT: kshiftlw $12, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %ecx, %xmm2
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r11d
; KNL-NEXT: kshiftlw $11, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r14d
; KNL-NEXT: kshiftlw $10, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %edx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %r15d
; KNL-NEXT: kshiftlw $9, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r12d
; KNL-NEXT: kshiftlw $8, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %eax, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r13d
; KNL-NEXT: kshiftlw $7, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %edx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $6, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: kshiftlw $5, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %eax, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %ebp
; KNL-NEXT: kshiftlw $4, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %edx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %ebx
; KNL-NEXT: kshiftlw $3, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $2, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %eax, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $1, %k2, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %edx, %xmm2, %xmm2
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vmovd %r10d, %xmm2
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL-NEXT: kshiftrw $15, %k2, %k0
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm2, %xmm1
-; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
-; KNL-NEXT: vpinsrb $14, %edx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm2, %xmm1
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, %esi, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, %edx, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm1, %xmm1
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
@@ -3088,139 +3135,146 @@ define void @store_64i1(<64 x i1>* %a, <
; KNL-NEXT: kmovw %k0, 4(%rdi)
; KNL-NEXT: kshiftlw $14, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r8d
; KNL-NEXT: kshiftlw $15, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: kshiftlw $13, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %r9d
; KNL-NEXT: kshiftlw $12, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vmovd %ecx, %xmm1
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r11d
; KNL-NEXT: kshiftlw $11, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r14d
; KNL-NEXT: kshiftlw $10, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $2, %edx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %r15d
; KNL-NEXT: kshiftlw $9, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %r12d
; KNL-NEXT: kshiftlw $8, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %r13d
; KNL-NEXT: kshiftlw $7, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $5, %edx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: kshiftlw $6, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %esi
; KNL-NEXT: kshiftlw $5, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %ebp
; KNL-NEXT: kshiftlw $4, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $8, %edx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: kmovw %k0, %ebx
; KNL-NEXT: kshiftlw $3, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $9, %ecx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %ecx
+; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: kshiftlw $2, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %eax
+; KNL-NEXT: kmovw %k0, %edx
; KNL-NEXT: kshiftlw $1, %k1, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpinsrb $11, %edx, %xmm1, %xmm1
-; KNL-NEXT: kmovw %k0, %edx
+; KNL-NEXT: vmovd %r10d, %xmm1
+; KNL-NEXT: kmovw %k0, %r10d
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $12, %ecx, %xmm1, %xmm0
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm1, %xmm0
+; KNL-NEXT: vpinsrb $2, %r9d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r10d, %xmm0, %xmm0
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: kmovw %k1, 2(%rdi)
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r8d
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $14, %edx, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r9d
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r10d
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: vmovd %edx, %xmm1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r11d
; KNL-NEXT: kshiftlw $11, %k0, %k1
-; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r14d
; KNL-NEXT: kshiftlw $10, %k0, %k1
-; KNL-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %r15d
; KNL-NEXT: kshiftlw $9, %k0, %k1
-; KNL-NEXT: vpinsrb $3, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %r12d
; KNL-NEXT: kshiftlw $8, %k0, %k1
-; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %r13d
; KNL-NEXT: kshiftlw $7, %k0, %k1
-; KNL-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %edx
; KNL-NEXT: kshiftlw $6, %k0, %k1
-; KNL-NEXT: vpinsrb $6, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %esi
; KNL-NEXT: kshiftlw $5, %k0, %k1
-; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %ebp
; KNL-NEXT: kshiftlw $4, %k0, %k1
-; KNL-NEXT: vpinsrb $8, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
+; KNL-NEXT: kmovw %k1, %ebx
; KNL-NEXT: kshiftlw $3, %k0, %k1
-; KNL-NEXT: vpinsrb $9, %edx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %edx
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $2, %k0, %k1
-; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: kshiftlw $1, %k0, %k1
-; KNL-NEXT: vpinsrb $11, %ecx, %xmm1, %xmm1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vpinsrb $12, %edx, %xmm1, %xmm1
+; KNL-NEXT: vmovd %r9d, %xmm0
+; KNL-NEXT: kmovw %k1, %r9d
+; KNL-NEXT: vpinsrb $1, %r8d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $2, %r10d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $3, %r11d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $5, %r15d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $6, %r12d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $7, %r13d, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $8, %edx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $9, %esi, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $10, %ebp, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $11, %ebx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k0, %k0
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
-; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; KNL-NEXT: vpinsrb $14, %r9d, %xmm0, %xmm0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm1, %xmm0
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k0, 2(%rdi)
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, (%rdi)
+; KNL-NEXT: popq %rbx
+; KNL-NEXT: popq %r12
+; KNL-NEXT: popq %r13
+; KNL-NEXT: popq %r14
+; KNL-NEXT: popq %r15
+; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: store_64i1:
@@ -3571,6 +3625,7 @@ define i32 @test_bitcast_v8i1_zext(<16 x
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: movzbl %al, %eax
; KNL-NEXT: addl %eax, %eax
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test_bitcast_v8i1_zext:
@@ -3609,40 +3664,14 @@ define i32 @test_bitcast_v8i1_zext(<16 x
}
define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; KNL-LABEL: test_bitcast_v16i1_zext:
-; KNL: ## BB#0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: addl %eax, %eax
-; KNL-NEXT: retq
-;
-; SKX-LABEL: test_bitcast_v16i1_zext:
-; SKX: ## BB#0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; SKX-NEXT: kmovw %k0, %eax
-; SKX-NEXT: addl %eax, %eax
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
-;
-; AVX512BW-LABEL: test_bitcast_v16i1_zext:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: kmovw %k0, %eax
-; AVX512BW-NEXT: addl %eax, %eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: test_bitcast_v16i1_zext:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
-; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: addl %eax, %eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; CHECK-LABEL: test_bitcast_v16i1_zext:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT: kmovw %k0, %eax
+; CHECK-NEXT: addl %eax, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%v1 = icmp eq <16 x i32> %a, zeroinitializer
%mask1 = bitcast <16 x i1> %v1 to i16
%val = zext i16 %mask1 to i32
Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK-LABEL: test1:
@@ -8,7 +8,6 @@ define <16 x float> @test1(<16 x float>
; CHECK-NEXT: vcmpleps %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = fcmp ole <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
ret <16 x float> %max
@@ -20,7 +19,6 @@ define <8 x double> @test2(<8 x double>
; CHECK-NEXT: vcmplepd %zmm1, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = fcmp ole <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
ret <8 x double> %max
@@ -32,7 +30,6 @@ define <16 x i32> @test3(<16 x i32> %x,
; CHECK-NEXT: vpcmpeqd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %yp, align 4
%mask = icmp eq <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -45,7 +42,6 @@ define <16 x i32> @test4_unsigned(<16 x
; CHECK-NEXT: vpcmpnltud %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp uge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -57,7 +53,6 @@ define <8 x i64> @test5(<8 x i64> %x, <8
; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp eq <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
ret <8 x i64> %max
@@ -69,7 +64,6 @@ define <8 x i64> @test6_unsigned(<8 x i6
; CHECK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i64> %x, %y
%max = select <8 x i1> %mask, <8 x i64> %x1, <8 x i64> %y
ret <8 x i64> %max
@@ -123,14 +117,12 @@ define <8 x i32> @test9(<8 x i32> %x, <8
; KNL-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test9:
; SKX: ## BB#0:
; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1
; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = icmp eq <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -145,14 +137,12 @@ define <8 x float> @test10(<8 x float> %
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test10:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqps %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = fcmp oeq <8 x float> %x, %y
%max = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
@@ -164,7 +154,6 @@ define <8 x i32> @test11_unsigned(<8 x i
; CHECK: ## BB#0:
; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp ugt <8 x i32> %x, %y
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
@@ -178,8 +167,8 @@ define i16 @test12(<16 x i64> %a, <16 x
; KNL-NEXT: kunpckbw %k0, %k1, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12:
; SKX: ## BB#0:
@@ -190,7 +179,6 @@ define i16 @test12(<16 x i64> %a, <16 x
; SKX-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <16 x i64> %a, %b
%res1 = bitcast <16 x i1> %res to i16
ret i16 %res1
@@ -269,6 +257,8 @@ define i32 @test12_v32i32(<32 x i32> %a,
; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
; KNL-NEXT: vpslld $31, %zmm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
; KNL-NEXT: vpcmpeqd %zmm2, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
@@ -325,13 +315,11 @@ define i32 @test12_v32i32(<32 x i32> %a,
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
-; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; KNL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
@@ -342,8 +330,8 @@ define i32 @test12_v32i32(<32 x i32> %a,
; KNL-NEXT: movl (%rsp), %eax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v32i32:
; SKX: ## BB#0:
@@ -353,7 +341,6 @@ define i32 @test12_v32i32(<32 x i32> %a,
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <32 x i32> %a, %b
%res1 = bitcast <32 x i1> %res to i32
ret i32 %res1
@@ -577,75 +564,75 @@ define i64 @test12_v64i16(<64 x i16> %a,
; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
-; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm1
-; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vpslld $31, %zmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpcmpeqw %ymm6, %ymm2, %ymm0
+; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftlw $14, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftlw $15, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %ecx
-; KNL-NEXT: vmovd %ecx, %xmm1
-; KNL-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT: vmovd %ecx, %xmm0
+; KNL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $13, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $12, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $11, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $10, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $9, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $8, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $7, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $6, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $5, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $4, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $3, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $2, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftlw $1, %k0, %k1
; KNL-NEXT: kshiftrw $15, %k1, %k1
; KNL-NEXT: kmovw %k1, %eax
-; KNL-NEXT: vpinsrb $14, %eax, %xmm1, %xmm1
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
+; KNL-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; KNL-NEXT: kmovw %k1, {{[0-9]+}}(%rsp)
+; KNL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
@@ -656,8 +643,8 @@ define i64 @test12_v64i16(<64 x i16> %a,
; KNL-NEXT: orq %rcx, %rax
; KNL-NEXT: movq %rbp, %rsp
; KNL-NEXT: popq %rbp
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test12_v64i16:
; SKX: ## BB#0:
@@ -667,7 +654,6 @@ define i64 @test12_v64i16(<64 x i16> %a,
; SKX-NEXT: kmovq %k0, %rax
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%res = icmp eq <64 x i16> %a, %b
%res1 = bitcast <64 x i1> %res to i64
ret i64 %res1
@@ -721,7 +707,6 @@ define <16 x i32> @test16(<16 x i32> %x,
; CHECK-NEXT: vpcmpled %zmm0, %zmm1, %k1
; CHECK-NEXT: vpblendmd %zmm2, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask = icmp sge <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x1, <16 x i32> %y
ret <16 x i32> %max
@@ -733,7 +718,6 @@ define <16 x i32> @test17(<16 x i32> %x,
; CHECK-NEXT: vpcmpgtd (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sgt <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -746,7 +730,6 @@ define <16 x i32> @test18(<16 x i32> %x,
; CHECK-NEXT: vpcmpled (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp sle <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -759,7 +742,6 @@ define <16 x i32> @test19(<16 x i32> %x,
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask = icmp ule <16 x i32> %x, %y
%max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
@@ -773,7 +755,6 @@ define <16 x i32> @test20(<16 x i32> %x,
; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp eq <16 x i32> %x1, %y1
%mask0 = icmp eq <16 x i32> %x, %y
%mask = select <16 x i1> %mask0, <16 x i1> %mask1, <16 x i1> zeroinitializer
@@ -788,7 +769,6 @@ define <8 x i64> @test21(<8 x i64> %x, <
; CHECK-NEXT: vpcmpleq %zmm2, %zmm3, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm2, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%mask0 = icmp sle <8 x i64> %x, %y
%mask = select <8 x i1> %mask0, <8 x i1> %mask1, <8 x i1> zeroinitializer
@@ -803,7 +783,6 @@ define <8 x i64> @test22(<8 x i64> %x, <
; CHECK-NEXT: vpcmpgtq (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sgt <8 x i64> %x1, %y1
%y = load <8 x i64>, <8 x i64>* %y.ptr, align 4
%mask0 = icmp sgt <8 x i64> %x, %y
@@ -819,7 +798,6 @@ define <16 x i32> @test23(<16 x i32> %x,
; CHECK-NEXT: vpcmpleud (%rdi), %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%y = load <16 x i32>, <16 x i32>* %y.ptr, align 4
%mask0 = icmp ule <16 x i32> %x, %y
@@ -834,7 +812,6 @@ define <8 x i64> @test24(<8 x i64> %x, <
; CHECK-NEXT: vpcmpeqq (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
%y = shufflevector <8 x i64> %y.0, <8 x i64> undef, <8 x i32> zeroinitializer
@@ -849,7 +826,6 @@ define <16 x i32> @test25(<16 x i32> %x,
; CHECK-NEXT: vpcmpled (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
%y = shufflevector <16 x i32> %y.0, <16 x i32> undef, <16 x i32> zeroinitializer
@@ -865,7 +841,6 @@ define <16 x i32> @test26(<16 x i32> %x,
; CHECK-NEXT: vpcmpgtd (%rdi){1to16}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <16 x i32> %x1, %y1
%yb = load i32, i32* %yb.ptr, align 4
%y.0 = insertelement <16 x i32> undef, i32 %yb, i32 0
@@ -883,7 +858,6 @@ define <8 x i64> @test27(<8 x i64> %x, i
; CHECK-NEXT: vpcmpleq (%rdi){1to8}, %zmm0, %k1 {%k1}
; CHECK-NEXT: vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%mask1 = icmp sge <8 x i64> %x1, %y1
%yb = load i64, i64* %yb.ptr, align 4
%y.0 = insertelement <8 x i64> undef, i64 %yb, i32 0
@@ -926,6 +900,7 @@ define <16 x i8>@test29(<16 x i32> %x, <
; KNL-NEXT: kxorw %k1, %k0, %k1
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vpmovdb %zmm0, %xmm0
+; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
; SKX-LABEL: test29:
@@ -949,14 +924,12 @@ define <4 x double> @test30(<4 x double>
; KNL-NEXT: vcmpeqpd %ymm1, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test30:
; SKX: ## BB#0:
; SKX-NEXT: vcmpeqpd %ymm1, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%mask = fcmp oeq <4 x double> %x, %y
%max = select <4 x i1> %mask, <4 x double> %x, <4 x double> %y
@@ -969,14 +942,12 @@ define <2 x double> @test31(<2 x double>
; KNL-NEXT: vcmpltpd (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test31:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <2 x double>, <2 x double>* %yp, align 4
%mask = fcmp olt <2 x double> %x, %y
@@ -990,14 +961,12 @@ define <4 x double> @test32(<4 x double>
; KNL-NEXT: vcmpltpd (%rdi), %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test32:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <4 x double>, <4 x double>* %yp, align 4
%mask = fcmp ogt <4 x double> %y, %x
@@ -1011,7 +980,6 @@ define <8 x double> @test33(<8 x double>
; CHECK-NEXT: vcmpltpd (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <8 x double>, <8 x double>* %yp, align 4
%mask = fcmp olt <8 x double> %x, %y
%max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %x1
@@ -1024,14 +992,12 @@ define <4 x float> @test34(<4 x float> %
; KNL-NEXT: vcmpltps (%rdi), %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test34:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <4 x float>, <4 x float>* %yp, align 4
%mask = fcmp olt <4 x float> %x, %y
%max = select <4 x i1> %mask, <4 x float> %x, <4 x float> %x1
@@ -1048,14 +1014,12 @@ define <8 x float> @test35(<8 x float> %
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test35:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi), %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%y = load <8 x float>, <8 x float>* %yp, align 4
%mask = fcmp ogt <8 x float> %y, %x
@@ -1069,7 +1033,6 @@ define <16 x float> @test36(<16 x float>
; CHECK-NEXT: vcmpltps (%rdi), %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%y = load <16 x float>, <16 x float>* %yp, align 4
%mask = fcmp olt <16 x float> %x, %y
%max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %x1
@@ -1082,7 +1045,6 @@ define <8 x double> @test37(<8 x double>
; CHECK-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1
; CHECK-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
@@ -1100,14 +1062,12 @@ define <4 x double> @test38(<4 x double>
; KNL-NEXT: vcmpltpd %ymm2, %ymm0, %ymm2
; KNL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test38:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to4}, %ymm0, %k1
; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <4 x double> undef, double %a, i32 0
@@ -1125,14 +1085,12 @@ define <2 x double> @test39(<2 x double>
; KNL-NEXT: vcmpltpd %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test39:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltpd (%rdi){1to2}, %xmm0, %k1
; SKX-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <2 x double> undef, double %a, i32 0
@@ -1150,7 +1108,6 @@ define <16 x float> @test40(<16 x floa
; CHECK-NEXT: vcmpltps (%rdi){1to16}, %zmm0, %k1
; CHECK-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; CHECK-NEXT: retq
-; CHECK-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <16 x float> undef, float %a, i32 0
@@ -1171,14 +1128,12 @@ define <8 x float> @test41(<8 x float>
; KNL-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test41:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to8}, %ymm0, %k1
; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <8 x float> undef, float %a, i32 0
@@ -1196,14 +1151,12 @@ define <4 x float> @test42(<4 x float>
; KNL-NEXT: vcmpltps %xmm2, %xmm0, %xmm2
; KNL-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test42:
; SKX: ## BB#0:
; SKX-NEXT: vcmpltps (%rdi){1to4}, %xmm0, %k1
; SKX-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load float, float* %ptr
%v = insertelement <4 x float> undef, float %a, i32 0
@@ -1223,7 +1176,6 @@ define <8 x double> @test43(<8 x double>
; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; KNL-NEXT: retq
-; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test43:
; SKX: ## BB#0:
@@ -1232,7 +1184,6 @@ define <8 x double> @test43(<8 x double>
; SKX-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1}
; SKX-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
; SKX-NEXT: retq
-; SKX-NEXT: ## -- End function
%a = load double, double* %ptr
%v = insertelement <8 x double> undef, double %a, i32 0
Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll Wed Sep 27 07:44:15 2017
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64)
@@ -11,8 +11,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.
; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1
; AVX512BW-NEXT: kmovq %rsi, %k1
; AVX512BW-NEXT: vpbroadcastb %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpbroadcastb %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastb %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -22,8 +22,8 @@ declare <64 x i8> @llvm.x86.avx512.mask.
; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastb %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpaddb %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1)
@@ -41,8 +41,8 @@ declare <32 x i16> @llvm.x86.avx512.mask
; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpbroadcastw %edi, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpbroadcastw %edi, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpbroadcastw %edi, %zmm2 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -52,8 +52,8 @@ declare <32 x i16> @llvm.x86.avx512.mask
; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm0 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpbroadcastw %eax, %zmm2 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1)
@@ -72,6 +72,7 @@ define void@test_int_x86_avx512_mask_sto
; AVX512BW-NEXT: kmovq %rdx, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
@@ -81,6 +82,7 @@ define void@test_int_x86_avx512_mask_sto
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu8 %zmm0, (%ecx) {%k1}
; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
@@ -95,6 +97,7 @@ define void@test_int_x86_avx512_mask_sto
; AVX512BW-NEXT: kmovd %edx, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1}
; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rsi)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
@@ -104,6 +107,7 @@ define void@test_int_x86_avx512_mask_sto
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vmovdqu16 %zmm0, (%ecx) {%k1}
; AVX512F-32-NEXT: vmovdqu32 %zmm0, (%eax)
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
@@ -253,8 +257,8 @@ define <64 x i8>@test_int_x86_avx512_mas
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512BW-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_palignr_512:
@@ -263,8 +267,8 @@ define <64 x i8>@test_int_x86_avx512_mas
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
; AVX512F-32-NEXT: vpalignr {{.*#+}} zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49]
-; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4)
@@ -283,8 +287,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512BW-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufh_w_512:
@@ -293,8 +297,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
; AVX512F-32-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -313,8 +317,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512BW-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pshufl_w_512:
@@ -323,8 +327,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
; AVX512F-32-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31]
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -339,6 +343,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a,
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_b:
@@ -351,6 +356,7 @@ define i64 @test_pcmpeq_b(<64 x i8> %a,
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
ret i64 %res
@@ -362,6 +368,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8>
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_b:
@@ -375,6 +382,7 @@ define i64 @test_mask_pcmpeq_b(<64 x i8>
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpeq.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
ret i64 %res
@@ -387,12 +395,14 @@ define i32 @test_pcmpeq_w(<32 x i16> %a,
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpeq_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
ret i32 %res
@@ -404,6 +414,7 @@ define i32 @test_mask_pcmpeq_w(<32 x i16
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpeq_w:
@@ -411,6 +422,7 @@ define i32 @test_mask_pcmpeq_w(<32 x i16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
ret i32 %res
@@ -423,6 +435,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a,
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_b:
@@ -435,6 +448,7 @@ define i64 @test_pcmpgt_b(<64 x i8> %a,
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 -1)
ret i64 %res
@@ -446,6 +460,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8>
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovq %k0, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_b:
@@ -459,6 +474,7 @@ define i64 @test_mask_pcmpgt_b(<64 x i8>
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $12, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i64 @llvm.x86.avx512.mask.pcmpgt.b.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
ret i64 %res
@@ -471,12 +487,14 @@ define i32 @test_pcmpgt_w(<32 x i16> %a,
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_pcmpgt_w:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 -1)
ret i32 %res
@@ -488,6 +506,7 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512BW-NEXT: kmovd %k0, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_pcmpgt_w:
@@ -495,6 +514,7 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1}
; AVX512F-32-NEXT: kmovd %k0, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16> %a, <32 x i16> %b, i32 %mask)
ret i32 %res
@@ -799,8 +819,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512:
@@ -809,8 +829,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512F-32-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -829,8 +849,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512:
@@ -839,8 +859,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2)
@@ -858,8 +878,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
@@ -868,8 +888,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm3, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
@@ -888,8 +908,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
@@ -898,8 +918,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
@@ -919,8 +939,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_w_512:
@@ -929,8 +949,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsraw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -949,8 +969,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psra_wi_512:
@@ -959,8 +979,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpsraw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psra.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -979,8 +999,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_w_512:
@@ -989,8 +1009,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpsllw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
@@ -1009,8 +1029,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512BW-NEXT: kmovd %esi, %k1
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_psll_wi_512:
@@ -1019,8 +1039,8 @@ define <32 x i16>@test_int_x86_avx512_ma
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm1 {%k1}
; AVX512F-32-NEXT: vpsllw $3, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.psll.wi.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3)
@@ -1650,6 +1670,7 @@ define i64 @test_cmp_b_512(<64 x i8> %a0
; AVX512BW-NEXT: kxnorq %k0, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_cmp_b_512:
@@ -1664,28 +1685,29 @@ define i64 @test_cmp_b_512(<64 x i8> %a0
; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
%res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -1730,33 +1752,26 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: addq %rdi, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_cmp_b_512:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: pushl %ebx
; AVX512F-32-NEXT: .Lcfi5:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
-; AVX512F-32-NEXT: pushl %ebx
+; AVX512F-32-NEXT: pushl %esi
; AVX512F-32-NEXT: .Lcfi6:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT: pushl %edi
+; AVX512F-32-NEXT: subl $60, %esp
; AVX512F-32-NEXT: .Lcfi7:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
-; AVX512F-32-NEXT: pushl %esi
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
; AVX512F-32-NEXT: .Lcfi8:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 20
-; AVX512F-32-NEXT: subl $60, %esp
+; AVX512F-32-NEXT: .cfi_offset %esi, -12
; AVX512F-32-NEXT: .Lcfi9:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 80
-; AVX512F-32-NEXT: .Lcfi10:
-; AVX512F-32-NEXT: .cfi_offset %esi, -20
-; AVX512F-32-NEXT: .Lcfi11:
-; AVX512F-32-NEXT: .cfi_offset %edi, -16
-; AVX512F-32-NEXT: .Lcfi12:
-; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: .Lcfi13:
-; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: .cfi_offset %ebx, -8
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
@@ -1777,39 +1792,39 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
@@ -1818,8 +1833,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -1828,8 +1843,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -1837,8 +1852,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -1847,8 +1862,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -1859,8 +1874,8 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -1868,631 +1883,639 @@ define i64 @test_mask_cmp_b_512(<64 x i8
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C
-; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $12, %eax
+; AVX512F-32-NEXT: andl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: movl %ecx, %ebp
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E
-; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $14, %eax
+; AVX512F-32-NEXT: andl $3, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F
-; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $15, %eax
+; AVX512F-32-NEXT: andl $1, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrl $16, %ecx
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $16, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %ebx
+; AVX512F-32-NEXT: movl %eax, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: shrb $3, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $4, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $5, %dl
+; AVX512F-32-NEXT: andb $1, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> %ECX<def>
-; AVX512F-32-NEXT: shrb $7, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
+; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %ebx
-; AVX512F-32-NEXT: shrl $24, %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT: movl %ebp, %ecx
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $29, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
+; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %eax
-; AVX512F-32-NEXT: shrl $31, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $4, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $5, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: movl %ecx, %esi
+; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %esi, %eax
+; AVX512F-32-NEXT: shrl $30, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %esi, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: andb $2, %al
+; AVX512F-32-NEXT: shrb %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: shrb $3, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $4, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $5, %al
+; AVX512F-32-NEXT: andb $1, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bh, %al
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bh, %cl
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movb %ch, %dl
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $12, %eax
+; AVX512F-32-NEXT: andl $15, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $14, %eax
+; AVX512F-32-NEXT: andl $3, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $15, %eax
+; AVX512F-32-NEXT: andl $1, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %ebx
+; AVX512F-32-NEXT: shrl $16, %ebx
+; AVX512F-32-NEXT: kmovd %ebx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: andb $15, %al
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $2, %al
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: shrb $3, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrl $16, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $2, %cl
-; AVX512F-32-NEXT: shrb %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %edx
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %dl
-; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $4, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $5, %cl
-; AVX512F-32-NEXT: andb $1, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $6, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: shrb $5, %al
+; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
+; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $2, %cl
-; AVX512F-32-NEXT: shrb %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT: shrb $7, %bl
+; AVX512F-32-NEXT: kmovd %ebx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: shrl $24, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm5, %zmm6, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm6, %zmm5, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpleb %zmm5, %zmm6, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpgtb %zmm6, %zmm5, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: vpcmpgtb %zmm0, %zmm1, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpleb %zmm0, %zmm1, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl %ebp, %eax
-; AVX512F-32-NEXT: adcxl %ebx, %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl %esi, %eax
+; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
; AVX512F-32-NEXT: popl %esi
-; AVX512F-32-NEXT: popl %edi
; AVX512F-32-NEXT: popl %ebx
-; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
%res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -2537,12 +2560,13 @@ define i64 @test_ucmp_b_512(<64 x i8> %a
; AVX512BW-NEXT: kxnorq %k0, %k0, %k0
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_ucmp_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi14:
+; AVX512F-32-NEXT: .Lcfi10:
; AVX512F-32-NEXT: .cfi_def_cfa_offset 64
; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
@@ -2551,28 +2575,29 @@ define i64 @test_ucmp_b_512(<64 x i8> %a
; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: addl (%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxnorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: addl $60, %esp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
%res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
@@ -2617,33 +2642,26 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512BW-NEXT: kmovq %k0, %rax
; AVX512BW-NEXT: addq %rcx, %rax
; AVX512BW-NEXT: addq %rdi, %rax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_x86_avx512_ucmp_b_512:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: pushl %ebp
-; AVX512F-32-NEXT: .Lcfi15:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: pushl %ebx
-; AVX512F-32-NEXT: .Lcfi16:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
-; AVX512F-32-NEXT: pushl %edi
-; AVX512F-32-NEXT: .Lcfi17:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 16
+; AVX512F-32-NEXT: .Lcfi11:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
; AVX512F-32-NEXT: pushl %esi
-; AVX512F-32-NEXT: .Lcfi18:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 20
+; AVX512F-32-NEXT: .Lcfi12:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 12
; AVX512F-32-NEXT: subl $60, %esp
-; AVX512F-32-NEXT: .Lcfi19:
-; AVX512F-32-NEXT: .cfi_def_cfa_offset 80
-; AVX512F-32-NEXT: .Lcfi20:
-; AVX512F-32-NEXT: .cfi_offset %esi, -20
-; AVX512F-32-NEXT: .Lcfi21:
-; AVX512F-32-NEXT: .cfi_offset %edi, -16
-; AVX512F-32-NEXT: .Lcfi22:
-; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: .Lcfi23:
-; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: .Lcfi13:
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 72
+; AVX512F-32-NEXT: .Lcfi14:
+; AVX512F-32-NEXT: .cfi_offset %esi, -12
+; AVX512F-32-NEXT: .Lcfi15:
+; AVX512F-32-NEXT: .cfi_offset %ebx, -8
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $5, %al
@@ -2664,39 +2682,39 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: kmovd %ecx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %edx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %ebx, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
; AVX512F-32-NEXT: kmovd %eax, %k0
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
; AVX512F-32-NEXT: vpsllq $40, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm3
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[4,5,6,7]
; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrb $6, %al
@@ -2705,8 +2723,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
@@ -2715,8 +2733,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %al
@@ -2724,8 +2742,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: andb $2, %al
@@ -2734,8 +2752,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movb %ch, %dl
@@ -2746,8 +2764,8 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
@@ -2755,631 +2773,639 @@ define i64 @test_mask_x86_avx512_ucmp_b_
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl $1036, %edi # imm = 0x40C
-; AVX512F-32-NEXT: bextrl %edi, %ecx, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $12, %eax
+; AVX512F-32-NEXT: andl $15, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: movl %ecx, %ebp
; AVX512F-32-NEXT: shrl $13, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl $526, %edx # imm = 0x20E
-; AVX512F-32-NEXT: bextrl %edx, %ebp, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $14, %eax
+; AVX512F-32-NEXT: andl $3, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
-; AVX512F-32-NEXT: movl $271, %esi # imm = 0x10F
-; AVX512F-32-NEXT: bextrl %esi, %ebp, %eax
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $15, %eax
+; AVX512F-32-NEXT: andl $1, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrl $16, %ecx
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $16, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %ebx
+; AVX512F-32-NEXT: movl %eax, %ebx
; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: movl %ebx, %edx
; AVX512F-32-NEXT: shrb $2, %bl
; AVX512F-32-NEXT: kmovd %ebx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: shrb $3, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $4, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $4, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $5, %al
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $5, %dl
+; AVX512F-32-NEXT: andb $1, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $6, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %CL<def> %CL<kill> %ECX<kill> %ECX<def>
-; AVX512F-32-NEXT: shrb $7, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
+; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %ebx
-; AVX512F-32-NEXT: shrl $24, %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm3, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm5
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm4
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm4, %k0
-; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
-; AVX512F-32-NEXT: andb $15, %bl
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $2, %bl
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm6
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
-; AVX512F-32-NEXT: movl %ebp, %ecx
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpbroadcastd %xmm5, %xmm5
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm7
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm7, %ymm5, %ymm5
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm5, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrl $29, %eax
-; AVX512F-32-NEXT: andb $1, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm5
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm5[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
-; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %eax
-; AVX512F-32-NEXT: shrl $30, %eax
+; AVX512F-32-NEXT: shrl $24, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebp, %eax
-; AVX512F-32-NEXT: shrl $31, %eax
-; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
-; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; AVX512F-32-NEXT: kmovd %ebx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: andb $2, %al
-; AVX512F-32-NEXT: shrb %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $4, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $28, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm4[4,5,6,7]
; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $5, %al
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: movl %ecx, %esi
+; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $6, %al
-; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm2
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %esi, %eax
+; AVX512F-32-NEXT: shrl $30, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bh, %al
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255]
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm1
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %esi, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
+; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: kmovd %ecx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm7
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm7, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: andb $2, %al
; AVX512F-32-NEXT: shrb %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movb %bh, %cl
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: shrb $3, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: bextrl %edi, %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrl $13, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $5, %al
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: bextrl %edx, %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $6, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: bextrl %esi, %ebx, %eax
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrb $7, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0]
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrl $16, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movb %ch, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $2, %cl
-; AVX512F-32-NEXT: shrb %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllw $8, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
-; AVX512F-32-NEXT: movl %ecx, %edx
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: andb $2, %al
+; AVX512F-32-NEXT: shrb %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movb %ch, %dl
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: shrb $3, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $12, %eax
+; AVX512F-32-NEXT: andl $15, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $13, %eax
+; AVX512F-32-NEXT: andb $1, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $14, %eax
+; AVX512F-32-NEXT: andl $3, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
+; AVX512F-32-NEXT: shrl $15, %eax
+; AVX512F-32-NEXT: andl $1, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0]
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %ebx
+; AVX512F-32-NEXT: shrl $16, %ebx
+; AVX512F-32-NEXT: kmovd %ebx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ebx, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllw $8, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: andb $15, %al
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: shrb $2, %al
+; AVX512F-32-NEXT: kmovd %eax, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: shrb $3, %dl
; AVX512F-32-NEXT: kmovd %edx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslld $24, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $4, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $5, %cl
-; AVX512F-32-NEXT: andb $1, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $40, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: shrb $6, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: # kill: %AL<def> %AL<kill> %EAX<kill> %EAX<def>
-; AVX512F-32-NEXT: shrb $7, %al
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslld $24, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: shrb $4, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpsllq $56, %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: movl %ebx, %eax
-; AVX512F-32-NEXT: shrl $24, %eax
+; AVX512F-32-NEXT: shrb $5, %al
+; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $2, %cl
-; AVX512F-32-NEXT: shrb %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: movl %eax, %ecx
-; AVX512F-32-NEXT: andb $15, %cl
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $40, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: shrb $6, %al
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: # kill: %BL<def> %BL<kill> %EBX<kill> %EBX<def>
+; AVX512F-32-NEXT: shrb $7, %bl
+; AVX512F-32-NEXT: kmovd %ebx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpsllq $56, %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
; AVX512F-32-NEXT: movl %ecx, %eax
-; AVX512F-32-NEXT: shrb $2, %cl
-; AVX512F-32-NEXT: kmovd %ecx, %k0
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512F-32-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm3, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm2
+; AVX512F-32-NEXT: shrl $24, %eax
+; AVX512F-32-NEXT: kmovd %eax, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $2, %dl
+; AVX512F-32-NEXT: shrb %dl
+; AVX512F-32-NEXT: kmovd %edx, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: movl %eax, %edx
+; AVX512F-32-NEXT: andb $15, %dl
+; AVX512F-32-NEXT: movl %edx, %eax
+; AVX512F-32-NEXT: shrb $2, %dl
+; AVX512F-32-NEXT: kmovd %edx, %k0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX512F-32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
; AVX512F-32-NEXT: shrb $3, %al
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm1
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm4
+; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm4, %ymm1, %ymm1
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $29, %eax
; AVX512F-32-NEXT: andb $1, %al
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $28, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-32-NEXT: kmovd %eax, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm4
-; AVX512F-32-NEXT: vpbroadcastd %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k1
-; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm4[0,1,2]
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
-; AVX512F-32-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm3
+; AVX512F-32-NEXT: vpbroadcastd %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k1
+; AVX512F-32-NEXT: vpmovm2b %k1, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2]
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255]
+; AVX512F-32-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $30, %eax
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm2
-; AVX512F-32-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-32-NEXT: kmovd %eax, %k0
-; AVX512F-32-NEXT: vpmovm2b %k0, %zmm4
-; AVX512F-32-NEXT: vpbroadcastw %xmm4, %xmm4
-; AVX512F-32-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm4, %ymm3
-; AVX512F-32-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
-; AVX512F-32-NEXT: movl %ebx, %eax
+; AVX512F-32-NEXT: vpmovm2b %k0, %zmm3
+; AVX512F-32-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX512F-32-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512F-32-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
+; AVX512F-32-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: vpmovb2m %zmm0, %k0
+; AVX512F-32-NEXT: movl %ecx, %eax
; AVX512F-32-NEXT: shrl $31, %eax
; AVX512F-32-NEXT: kshiftlq $1, %k0, %k0
; AVX512F-32-NEXT: kshiftrq $1, %k0, %k0
; AVX512F-32-NEXT: kmovd %eax, %k1
; AVX512F-32-NEXT: kshiftlq $63, %k1, %k1
; AVX512F-32-NEXT: korq %k1, %k0, %k1
-; AVX512F-32-NEXT: vpcmpeqb %zmm1, %zmm0, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpeqb %zmm6, %zmm5, %k0 {%k1}
+; AVX512F-32-NEXT: vpcmpltub %zmm6, %zmm5, %k2 {%k1}
+; AVX512F-32-NEXT: vpcmpleub %zmm6, %zmm5, %k3 {%k1}
+; AVX512F-32-NEXT: vpcmpneqb %zmm6, %zmm5, %k4 {%k1}
+; AVX512F-32-NEXT: vpcmpnltub %zmm6, %zmm5, %k5 {%k1}
+; AVX512F-32-NEXT: vpcmpnleub %zmm6, %zmm5, %k1 {%k1}
; AVX512F-32-NEXT: kmovq %k0, (%esp)
; AVX512F-32-NEXT: movl (%esp), %eax
; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: vpcmpltub %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k3, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kxorq %k0, %k0, %k0
; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: orl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1}
-; AVX512F-32-NEXT: vpcmpnltub %zmm1, %zmm0, %k2 {%k1}
-; AVX512F-32-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 {%k1}
-; AVX512F-32-NEXT: kmovq %k0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: kmovq %k4, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: kmovq %k2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: kmovq %k5, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
; AVX512F-32-NEXT: kmovq %k1, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: addl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT: adcxl {{[0-9]+}}(%esp), %edx
-; AVX512F-32-NEXT: addl %ebp, %eax
-; AVX512F-32-NEXT: adcxl %ebx, %edx
+; AVX512F-32-NEXT: adcl {{[0-9]+}}(%esp), %edx
+; AVX512F-32-NEXT: addl %esi, %eax
+; AVX512F-32-NEXT: adcl %ecx, %edx
; AVX512F-32-NEXT: addl $60, %esp
; AVX512F-32-NEXT: popl %esi
-; AVX512F-32-NEXT: popl %edi
; AVX512F-32-NEXT: popl %ebx
-; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
%res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
@@ -3424,6 +3450,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a
; AVX512BW-NEXT: kxnord %k0, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_cmp_w_512:
@@ -3448,6 +3475,7 @@ define i32 @test_cmp_w_512(<32 x i16> %a
; AVX512F-32-NEXT: kxnord %k0, %k0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
%res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3492,6 +3520,7 @@ define i32 @test_mask_cmp_w_512(<32 x i1
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: addl %edi, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_cmp_w_512:
@@ -3519,6 +3548,7 @@ define i32 @test_mask_cmp_w_512(<32 x i1
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %edx, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
%res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
@@ -3563,6 +3593,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %
; AVX512BW-NEXT: kxnord %k0, %k0, %k0
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_ucmp_w_512:
@@ -3587,6 +3618,7 @@ define i32 @test_ucmp_w_512(<32 x i16> %
; AVX512F-32-NEXT: kxnord %k0, %k0, %k0
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
@@ -3631,6 +3663,7 @@ define i32 @test_mask_ucmp_w_512(<32 x i
; AVX512BW-NEXT: kmovd %k0, %eax
; AVX512BW-NEXT: addl %ecx, %eax
; AVX512BW-NEXT: addl %edi, %eax
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_mask_ucmp_w_512:
@@ -3658,6 +3691,7 @@ define i32 @test_mask_ucmp_w_512(<32 x i
; AVX512F-32-NEXT: kmovd %k0, %eax
; AVX512F-32-NEXT: addl %edx, %eax
; AVX512F-32-NEXT: addl %ecx, %eax
+; AVX512F-32-NEXT: vzeroupper
; AVX512F-32-NEXT: retl
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
%res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll?rev=314306&r1=314305&r2=314306&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll Wed Sep 27 07:44:15 2017
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl --show-mc-encoding| FileCheck %s
declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
@@ -9,8 +9,8 @@ define <16 x i8>@test_int_x86_avx512_mas
; CHECK-NEXT: vpbroadcastb %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpbroadcastb %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
-; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
-; CHECK-NEXT: vpbroadcastb %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1)
@@ -30,8 +30,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: vpbroadcastw %edi, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpbroadcastw %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
-; CHECK-NEXT: vpbroadcastw %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %edi, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1)
@@ -51,8 +51,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: vpbroadcastb %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpbroadcastb %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7]
-; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
-; CHECK-NEXT: vpbroadcastb %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xcf]
+; CHECK-NEXT: vpbroadcastb %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7]
+; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2]
; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1)
@@ -73,8 +73,8 @@ declare <16 x i16> @llvm.x86.avx512.mask
; CHECK-NEXT: vpbroadcastw %edi, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpbroadcastw %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
-; CHECK-NEXT: vpbroadcastw %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT: vpbroadcastw %edi, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7]
+; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1)
@@ -93,9 +93,9 @@ define <32 x i8>@test_int_x86_avx512_pbr
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
-; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
%res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask)
@@ -113,9 +113,9 @@ define <16 x i8>@test_int_x86_avx512_pbr
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
-; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
%res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask)
@@ -133,9 +133,9 @@ define <16 x i16>@test_int_x86_avx512_pb
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
-; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
%res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask)
@@ -153,9 +153,9 @@ define <8 x i16>@test_int_x86_avx512_pbr
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
%res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask)
@@ -173,9 +173,9 @@ define <64 x i8>@test_int_x86_avx512_pbr
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
-; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
+; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0]
+; CHECK-NEXT: vpaddb %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
%res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask)
@@ -193,9 +193,9 @@ define <32 x i16>@test_int_x86_avx512_pb
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
-; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
%res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask)
@@ -227,6 +227,7 @@ define void @test_int_x86_avx512_mask_sto
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu8 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
@@ -255,6 +256,7 @@ define void @test_int_x86_avx512_mask_sto
; CHECK-NEXT: kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
; CHECK-NEXT: vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
; CHECK-NEXT: vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
@@ -345,8 +347,8 @@ define <16 x i8>@test_int_x86_avx512_mas
; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
-; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
+; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4)
@@ -368,8 +370,8 @@ define <32 x i8>@test_int_x86_avx512_mas
; CHECK-NEXT: ## ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17]
-; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: vpaddb %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3]
+; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4)
@@ -391,8 +393,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -414,8 +416,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -437,8 +439,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -460,8 +462,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -476,6 +478,7 @@ define i32 @test_pcmpeq_b_256(<32 x i8>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
ret i32 %res
@@ -487,6 +490,7 @@ define i32 @test_mask_pcmpeq_b_256(<32 x
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x74,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
ret i32 %res
@@ -500,6 +504,7 @@ define i16 @test_pcmpeq_w_256(<16 x i16>
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
ret i16 %res
@@ -512,6 +517,7 @@ define i16 @test_mask_pcmpeq_w_256(<16 x
; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
ret i16 %res
@@ -524,6 +530,7 @@ define i32 @test_pcmpgt_b_256(<32 x i8>
; CHECK: ## BB#0:
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
ret i32 %res
@@ -535,6 +542,7 @@ define i32 @test_mask_pcmpgt_b_256(<32 x
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
ret i32 %res
@@ -548,6 +556,7 @@ define i16 @test_pcmpgt_w_256(<16 x i16>
; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
ret i16 %res
@@ -560,6 +569,7 @@ define i16 @test_mask_pcmpgt_w_256(<16 x
; CHECK-NEXT: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
ret i16 %res
@@ -1660,9 +1670,9 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1680,8 +1690,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
+; CHECK-NEXT: vpaddw %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
@@ -1701,8 +1711,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
; CHECK-NEXT: vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1721,8 +1731,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
; CHECK-NEXT: vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1741,8 +1751,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
; CHECK-NEXT: vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1]
-; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -1761,8 +1771,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
; CHECK-NEXT: vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1]
-; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -1780,9 +1790,9 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1)
@@ -1800,8 +1810,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
@@ -1821,8 +1831,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
; CHECK-NEXT: vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psra.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1841,8 +1851,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
; CHECK-NEXT: vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psra.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1861,8 +1871,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
; CHECK-NEXT: vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.psll.wi.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3)
@@ -1881,8 +1891,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
; CHECK-NEXT: vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.psll.wi.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3)
@@ -1936,8 +1946,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0]
; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -1959,8 +1969,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0]
; CHECK-NEXT: ## ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -1980,8 +1990,8 @@ define <8 x i16>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0]
-; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2)
@@ -2000,8 +2010,8 @@ define <16 x i16>@test_int_x86_avx512_ma
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2]
+; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2)
@@ -2020,8 +2030,8 @@ define <2 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0]
-; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2]
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2)
%res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2)
@@ -2040,8 +2050,8 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2]
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2)
%res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2)
@@ -2834,23 +2844,23 @@ define <8 x i32> @test_mask_cmp_b_256(<3
; CHECK-NEXT: vpcmpgtb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x64,0xc0]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT: vpcmpleb %ymm0, %ymm1, %k0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x3f,0xc0,0x02]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x64,0xc1]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2932,23 +2942,23 @@ define <8 x i32> @test_mask_ucmp_b_256(<
; CHECK-NEXT: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x01]
; CHECK-NEXT: kmovd %k0, %r9d ## encoding: [0xc5,0x7b,0x93,0xc8]
; CHECK-NEXT: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x02]
-; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
-; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
+; CHECK-NEXT: kmovd %k0, %r10d ## encoding: [0xc5,0x7b,0x93,0xd0]
+; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
; CHECK-NEXT: kmovd %k0, %esi ## encoding: [0xc5,0xfb,0x93,0xf0]
-; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
+; CHECK-NEXT: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3f,0xc1,0x04]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x05]
; CHECK-NEXT: kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
-; CHECK-NEXT: vmovd %esi, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc6]
-; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc0,0x01]
-; CHECK-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x02]
+; CHECK-NEXT: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x3e,0xc1,0x06]
+; CHECK-NEXT: kmovd %k0, %edx ## encoding: [0xc5,0xfb,0x93,0xd0]
+; CHECK-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
+; CHECK-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc1,0x01]
+; CHECK-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc2,0x02]
; CHECK-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x22,0xc7,0x03]
; CHECK-NEXT: vmovd %r8d, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xc1,0x79,0x6e,0xc8]
; CHECK-NEXT: vpinsrd $1, %r9d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xc9,0x01]
-; CHECK-NEXT: vpinsrd $2, %edx, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xca,0x02]
-; CHECK-NEXT: kxord %k0, %k0, %k0 ## encoding: [0xc4,0xe1,0xfd,0x47,0xc0]
-; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
-; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xc8,0x03]
+; CHECK-NEXT: vpinsrd $2, %r10d, %xmm1, %xmm1 ## encoding: [0xc4,0xc3,0x71,0x22,0xca,0x02]
+; CHECK-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x22,0xce,0x03]
; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x75,0x38,0xc0,0x01]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
@@ -2997,6 +3007,7 @@ define <8 x i16> @test_cmp_w_256(<16 x i
; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3043,6 +3054,7 @@ define <8 x i16> @test_mask_cmp_w_256(<1
; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3090,6 +3102,7 @@ define <8 x i16> @test_ucmp_w_256(<16 x
; CHECK-NEXT: kxnorw %k0, %k0, %k0 ## encoding: [0xc5,0xfc,0x46,0xc0]
; CHECK-NEXT: kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
; CHECK-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
@@ -3136,6 +3149,7 @@ define <8 x i16> @test_mask_ucmp_w_256(<
; CHECK-NEXT: kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
; CHECK-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
; CHECK-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc7,0x07]
+; CHECK-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
%vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
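
For anyone wanting to re-check one of the updated files by hand, a minimal sketch of the llc/FileCheck pipeline is below. The triple, feature string, and path are illustrative assumptions only; the authoritative invocation is the RUN line at the top of the test file itself.

  # Sketch: manually run one of the updated tests (flags here are assumptions,
  # not necessarily the exact RUN line in the file).
  cd llvm                     # assumes a checkout of llvm/trunk
  llc < test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll \
      -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl -show-mc-encoding \
    | FileCheck test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll

Running the test through lit instead (for example bin/llvm-lit path/to/the/test from a build directory) picks up the real RUN lines and avoids guessing any flags.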